From 06a4607279d34da756ecf1369ebb27fd409caf73 Mon Sep 17 00:00:00 2001 From: VinPPP Date: Wed, 29 Jan 2025 05:27:54 +0000 Subject: [PATCH] deploy: 7b7f942af1ae56f816bee4f5e57e1f0c5f193e87 --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 34590 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 34985 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..fa4f138 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2025-01-21T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.09898v2","updated":"2025-01-21T18:46:52Z","published":"2025-01-17T01:01:44Z","title":"FoundationStereo: Zero-Shot Stereo Matching","summary":" Tremendous progress has been made in deep stereo matching to excel on\nbenchmark datasets through per-domain fine-tuning. However, achieving strong\nzero-shot generalization - a hallmark of foundation models in other computer\nvision tasks - remains challenging for stereo matching. We introduce\nFoundationStereo, a foundation model for stereo depth estimation designed to\nachieve strong zero-shot generalization. To this end, we first construct a\nlarge-scale (1M stereo pairs) synthetic training dataset featuring large\ndiversity and high photorealism, followed by an automatic self-curation\npipeline to remove ambiguous samples. We then design a number of network\narchitecture components to enhance scalability, including a side-tuning feature\nbackbone that adapts rich monocular priors from vision foundation models to\nmitigate the sim-to-real gap, and long-range context reasoning for effective\ncost volume filtering. Together, these components lead to strong robustness and\naccuracy across domains, establishing a new standard in zero-shot stereo depth\nestimation. Project page: https://nvlabs.github.io/FoundationStereo/\n","authors":["Bowen Wen","Matthew Trepte","Joseph Aribido","Jan Kautz","Orazio Gallo","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2501.09898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12234v1","updated":"2025-01-21T15:57:15Z","published":"2025-01-21T15:57:15Z","title":"Multi-Agent Feedback Motion Planning using Probably Approximately\n Correct Nonlinear Model Predictive Control","summary":" For many tasks, multi-robot teams often provide greater efficiency,\nrobustness, and resiliency. However, multi-robot collaboration in real-world\nscenarios poses a number of major challenges, especially when dynamic robots\nmust balance competing objectives like formation control and obstacle avoidance\nin the presence of stochastic dynamics and sensor uncertainty. In this paper,\nwe propose a distributed, multi-agent receding-horizon feedback motion planning\napproach using Probably Approximately Correct Nonlinear Model Predictive\nControl (PAC-NMPC) that is able to reason about both model and measurement\nuncertainty to achieve robust multi-agent formation control while navigating\ncluttered obstacle fields and avoiding inter-robot collisions. Our approach\nrelies not only on the underlying PAC-NMPC algorithm but also on a terminal\ncost-function derived from gyroscopic obstacle avoidance. 
Through numerical\nsimulation, we show that our distributed approach performs on par with a\ncentralized formulation, that it offers improved performance in the case of\nsignificant measurement noise, and that it can scale to more complex dynamical\nsystems.\n","authors":["Mark Gonzales","Adam Polevoy","Marin Kobilarov","Joseph Moore"],"pdf_url":"https://arxiv.org/pdf/2501.12234v1.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.00275v3","updated":"2025-01-21T15:33:35Z","published":"2024-08-01T04:29:34Z","title":"A Search-to-Control Reinforcement Learning Based Framework for Quadrotor\n Local Planning in Dense Environments","summary":" Agile flight in complex environments poses significant challenges to current\nmotion planning methods, as they often fail to fully leverage the quadrotor's\ndynamic potential, leading to performance failures and reduced efficiency\nduring aggressive maneuvers. Existing approaches frequently decouple trajectory\noptimization from control generation and neglect the dynamics, further limiting\ntheir ability to generate aggressive and feasible motions. To address these\nchallenges, we introduce an enhanced Search-to-Control planning framework that\nintegrates visibility path searching with reinforcement learning (RL) control\ngeneration, directly accounting for dynamics and bridging the gap between\nplanning and control. Our method first extracts control points from\ncollision-free paths using a proposed heuristic search, which are then refined\nby an RL policy to generate low-level control commands for the quadrotor's\ncontroller, utilizing reduced-dimensional obstacle observations for efficient\ninference with lightweight neural networks. We validate the framework through\nsimulations and real-world experiments, demonstrating improved time efficiency\nand dynamic maneuverability compared to existing methods, while confirming its\nrobustness and applicability. To support further research, We will release our\nimplementation as an open-source package.\n","authors":["Zhaohong Liu","Wenxuan Gao","Yinshuai Sun","Peng Dong"],"pdf_url":"https://arxiv.org/pdf/2408.00275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12214v1","updated":"2025-01-21T15:32:33Z","published":"2025-01-21T15:32:33Z","title":"Improving robot understanding using conversational AI: demonstration and\n feasibility study","summary":" Explanations constitute an important aspect of successful human robot\ninteractions and can enhance robot understanding. To improve the understanding\nof the robot, we have developed four levels of explanation (LOE) based on two\nquestions: what needs to be explained, and why the robot has made a particular\ndecision. The understandable robot requires a communicative action when there\nis disparity between the human s mental model of the robot and the robots state\nof mind. This communicative action was generated by utilizing a conversational\nAI platform to generate explanations. An adaptive dialog was implemented for\ntransition from one LOE to another. 
Here, we demonstrate the adaptive dialog in\na collaborative task with errors and provide results of a feasibility study\nwith users.\n","authors":["Shikhar Kumar","Yael Edan"],"pdf_url":"https://arxiv.org/pdf/2501.12214v1.pdf","comment":"40th Anniversary, IEEE International Conference on Robotics and\n Automation,2024"},{"id":"http://arxiv.org/abs/2501.12128v1","updated":"2025-01-21T13:42:06Z","published":"2025-01-21T13:42:06Z","title":"Evaluating Efficiency and Engagement in Scripted and LLM-Enhanced\n Human-Robot Interactions","summary":" To achieve natural and intuitive interaction with people, HRI frameworks\ncombine a wide array of methods for human perception, intention communication,\nhuman-aware navigation and collaborative action. In practice, when encountering\nunpredictable behavior of people or unexpected states of the environment, these\nframeworks may lack the ability to dynamically recognize such states, adapt and\nrecover to resume the interaction. Large Language Models (LLMs), owing to their\nadvanced reasoning capabilities and context retention, present a promising\nsolution for enhancing robot adaptability. This potential, however, may not\ndirectly translate to improved interaction metrics. This paper considers a\nrepresentative interaction with an industrial robot involving approach,\ninstruction, and object manipulation, implemented in two conditions: (1) fully\nscripted and (2) including LLM-enhanced responses. We use gaze tracking and\nquestionnaires to measure the participants' task efficiency, engagement, and\nrobot perception. The results indicate higher subjective ratings for the LLM\ncondition, but objective metrics show that the scripted condition performs\ncomparably, particularly in efficiency and focus during simple tasks. We also\nnote that the scripted condition may have an edge over LLM-enhanced responses\nin terms of response latency and energy consumption, especially for trivial and\nrepetitive interactions.\n","authors":["Tim Schreiter","Jens V. Rüppel","Rishi Hazra","Andrey Rudenko","Martin Magnusson","Achim J. Lilienthal"],"pdf_url":"https://arxiv.org/pdf/2501.12128v1.pdf","comment":"Accepted as a Late-Breaking Report to the 2025, 20th ACM/IEEE\n International Conference on Human-Robot Interaction (HRI)"},{"id":"http://arxiv.org/abs/2501.12073v1","updated":"2025-01-21T11:59:07Z","published":"2025-01-21T11:59:07Z","title":"Towards autonomous photogrammetric forest inventory using a lightweight\n under-canopy robotic drone","summary":" Drones are increasingly used in forestry to capture high-resolution remote\nsensing data. While operations above the forest canopy are already highly\nautomated, flying inside forests remains challenging, primarily relying on\nmanual piloting. Inside dense forests, reliance on the Global Navigation\nSatellite System (GNSS) for localization is not feasible. Additionally, the\ndrone must autonomously adjust its flight path to avoid collisions. Recently,\nadvancements in robotics have enabled autonomous drone flights in GNSS-denied\nobstacle-rich areas. In this article, a step towards autonomous forest data\ncollection is taken by building a prototype of a robotic under-canopy drone\nutilizing state-of-the-art open-source methods and validating its performance\nfor data collection inside forests. The autonomous flight capability was\nevaluated through multiple test flights in two boreal forest test sites. 
The\ntree parameter estimation capability was studied by conducting diameter at\nbreast height (DBH) estimation using onboard stereo camera data and\nphotogrammetric methods. The prototype conducted flights in selected\nchallenging forest environments, and the experiments showed excellent\nperformance in forest reconstruction with a miniaturized stereoscopic\nphotogrammetric system. The stem detection algorithm managed to identify 79.31\n% of the stems. The DBH estimation had a root mean square error (RMSE) of 3.33\ncm (12.79 %) and a bias of 1.01 cm (3.87 %) across all trees. For trees with a\nDBH less than 30 cm, the RMSE was 1.16 cm (5.74 %), and the bias was 0.13 cm\n(0.64 %). When considering the overall performance in terms of DBH accuracy,\nautonomy, and forest complexity, the proposed approach was superior compared to\nmethods proposed in the scientific literature. Results provided valuable\ninsights into autonomous forest reconstruction using drones, and several\nfurther development topics were proposed.\n","authors":["Väinö Karjalainen","Niko Koivumäki","Teemu Hakala","Jesse Muhojoki","Eric Hyyppä","Anand George","Juha Suomalainen","Eija Honkavaara"],"pdf_url":"https://arxiv.org/pdf/2501.12073v1.pdf","comment":"35 pages, 13 Figures"},{"id":"http://arxiv.org/abs/2310.03505v2","updated":"2025-01-21T10:37:18Z","published":"2023-10-05T12:35:09Z","title":"RadaRays: Real-time Simulation of Rotating FMCW Radar for Mobile\n Robotics via Hardware-accelerated Ray Tracing","summary":" RadaRays allows for the accurate modeling and simulation of rotating FMCW\nradar sensors in complex environments, including the simulation of reflection,\nrefraction, and scattering of radar waves. Our software is able to handle large\nnumbers of objects and materials in real-time, making it suitable for use in a\nvariety of mobile robotics applications. We demonstrate the effectiveness of\nRadaRays through a series of experiments and show that it can more accurately\nreproduce the behavior of FMCW radar sensors in a variety of environments,\ncompared to the ray casting-based lidar-like simulations that are commonly used\nin simulators for autonomous driving such as CARLA. Our experiments\nadditionally serve as a valuable reference point for researchers to evaluate\ntheir own radar simulations. By using RadaRays, developers can significantly\nreduce the time and cost associated with prototyping and testing FMCW\nradar-based algorithms. We also provide a Gazebo plugin that makes our work\naccessible to the mobile robotics community.\n","authors":["Alexander Mock","Martin Magnusson","Joachim Hertzberg"],"pdf_url":"https://arxiv.org/pdf/2310.03505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12025v1","updated":"2025-01-21T10:34:04Z","published":"2025-01-21T10:34:04Z","title":"Low-Cost 3D printed, Biocompatible Ionic Polymer Membranes for Soft\n Actuators","summary":" Ionic polymer actuators, in essence, consist of ion exchange polymers\nsandwiched between layers of electrodes. They have recently gained recognition\nas promising candidates for soft actuators due to their lightweight nature,\nnoise-free operation, and low-driving voltages. However, the materials\ntraditionally utilized to develop them are often not human/environmentally\nfriendly. Thus, to address this issue, researchers have been focusing on\ndeveloping biocompatible versions of this actuator. Despite this, such\nactuators still face challenges in achieving high performance, in payload\ncapacity, bending capabilities, and response time. 
In this paper, we present a\nbiocompatible ionic polymer actuator whose membrane is fully 3D printed\nutilizing a direct ink writing method. The structure of the printed membranes\nconsists of biodegradable ionic fluid encapsulated within layers of activated\ncarbon polymers. From the microscopic observations of its structure, we\nconfirmed that the ionic polymer is well encapsulated. The actuators can\nachieve a bending performance of up to 124$^\\circ$ (curvature of 0.82\n$\\text{cm}^{-1}$), which, to our knowledge, is the highest curvature attained\nby any bending ionic polymer actuator to date. It can operate comfortably up to\na 2 Hz driving frequency and can achieve blocked forces of up to 0.76 mN. Our\nresults showcase a promising, high-performing biocompatible ionic polymer\nactuator, whose membrane can be easily manufactured in a single step using a\nstandard FDM 3D printer. This approach paves the way for creating customized\ndesigns for functional soft robotic applications, including human-interactive\ndevices, in the near future.\n","authors":["Nils Trümpler","Ryo Kanno","Niu David","Anja Huch","Pham Huy Nguyen","Maksims Jurinovs","Gustav Nyström","Sergejs Gaidukovs","Mirko Kovac"],"pdf_url":"https://arxiv.org/pdf/2501.12025v1.pdf","comment":"6 pages, 8 figures, Accepted in IEEE International Conference on Soft\n Robotics 2025 (Robosoft)"},{"id":"http://arxiv.org/abs/2307.09105v3","updated":"2025-01-21T09:41:04Z","published":"2023-07-18T09:54:01Z","title":"Sampling-based Model Predictive Control Leveraging Parallelizable\n Physics Simulations","summary":" We present a method for sampling-based model predictive control that makes\nuse of a generic physics simulator as the dynamical model. In particular, we\npropose a Model Predictive Path Integral controller (MPPI), that uses the\nGPU-parallelizable IsaacGym simulator to compute the forward dynamics of a\nproblem. By doing so, we eliminate the need for explicit encoding of robot\ndynamics and contacts with objects for MPPI. Since no explicit dynamic modeling\nis required, our method is easily extendable to different objects and robots\nand allows one to solve complex navigation and contact-rich tasks. We\ndemonstrate the effectiveness of this method in several simulated and\nreal-world settings, among which mobile navigation with collision avoidance,\nnon-prehensile manipulation, and whole-body control for high-dimensional\nconfiguration spaces. This method is a powerful and accessible open-source tool\nto solve a large variety of contact-rich motion planning tasks.\n","authors":["Corrado Pezzato","Chadi Salmi","Elia Trevisan","Max Spahn","Javier Alonso-Mora","Carlos Hernández Corbato"],"pdf_url":"https://arxiv.org/pdf/2307.09105v3.pdf","comment":"Accepted for RA-L. Code and videos available at\n https://autonomousrobots.nl/paper_websites/isaac-mppi"},{"id":"http://arxiv.org/abs/2410.06052v4","updated":"2025-01-21T08:48:08Z","published":"2024-10-08T13:54:04Z","title":"Concurrent-Learning Based Relative Localization in Shape Formation of\n Robot Swarms (Extended version)","summary":" In this paper, we address the shape formation problem for massive robot\nswarms in environments where external localization systems are unavailable.\nAchieving this task effectively with solely onboard measurements is still\nscarcely explored and faces some practical challenges. To solve this\nchallenging problem, we propose the following novel results. 
Firstly, to\nestimate the relative positions among neighboring robots, a concurrent-learning\nbased estimator is proposed. It relaxes the persistent excitation condition\nrequired in the classical ones such as least-square estimator. Secondly, we\nintroduce a finite-time agreement protocol to determine the shape location.\nThis is achieved by estimating the relative position between each robot and a\nrandomly assigned seed robot. The initial position of the seed one marks the\nshape location. Thirdly, based on the theoretical results of the relative\nlocalization, a novel behavior-based control strategy is devised. This strategy\nnot only enables adaptive shape formation of large group of robots but also\nenhances the observability of inter-robot relative localization. Numerical\nsimulation results are provided to verify the performance of our proposed\nstrategy compared to the state-of-the-art ones. Additionally, outdoor\nexperiments on real robots further demonstrate the practical effectiveness and\nrobustness of our methods.\n","authors":["Jinhu Lü","Kunrui Ze","Shuoyu Yue","Kexin Liu","Wei Wang","Guibin Sun"],"pdf_url":"https://arxiv.org/pdf/2410.06052v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11945v1","updated":"2025-01-21T07:41:12Z","published":"2025-01-21T07:41:12Z","title":"Learning to Hop for a Single-Legged Robot with Parallel Mechanism","summary":" This work presents the application of reinforcement learning to improve the\nperformance of a highly dynamic hopping system with a parallel mechanism.\nUnlike serial mechanisms, parallel mechanisms can not be accurately simulated\ndue to the complexity of their kinematic constraints and closed-loop\nstructures. Besides, learning to hop suffers from prolonged aerial phase and\nthe sparse nature of the rewards. To address them, we propose a learning\nframework to encode long-history feedback to account for the under-actuation\nbrought by the prolonged aerial phase. In the proposed framework, we also\nintroduce a simplified serial configuration for the parallel design to avoid\ndirectly simulating parallel structure during the training. A torque-level\nconversion is designed to deal with the parallel-serial conversion to handle\nthe sim-to-real issue. Simulation and hardware experiments have been conducted\nto validate this framework.\n","authors":["Hongbo Zhang","Xiangyu Chu","Yanlin Chen","Yunxi Tang","Linzhu Yue","Yun-Hui Liu","Kwok Wai Samuel Au"],"pdf_url":"https://arxiv.org/pdf/2501.11945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11938v1","updated":"2025-01-21T07:27:14Z","published":"2025-01-21T07:27:14Z","title":"Navigating Robot Swarm Through a Virtual Tube with Flow-Adaptive\n Distribution Control","summary":" With the rapid development of robot swarm technology and its diverse\napplications, navigating robot swarms through complex environments has emerged\nas a critical research direction. To ensure safe navigation and avoid potential\ncollisions with obstacles, the concept of virtual tubes has been introduced to\ndefine safe and navigable regions. However, current control methods in virtual\ntubes face the congestion issues, particularly in narrow virtual tubes with low\nthroughput. To address these challenges, we first originally introduce the\nconcepts of virtual tube area and flow capacity, and develop an new evolution\nmodel for the spatial density function. 
Next, we propose a novel control method\nthat combines a modified artificial potential field (APF) for swarm navigation\nand density feedback control for distribution regulation, under which a\nsaturated velocity command is designed. Then, we generate a global velocity\nfield that not only ensures collision-free navigation through the virtual tube,\nbut also achieves locally input-to-state stability (LISS) for density tracking\nerrors, both of which are rigorously proven. Finally, numerical simulations and\nrealistic applications validate the effectiveness and advantages of the\nproposed method in managing robot swarms within narrow virtual tubes.\n","authors":["Yongwei Zhang","Shuli Lv","Kairong Liu","Quanyi Liang","Quan Quan","Zhikun She"],"pdf_url":"https://arxiv.org/pdf/2501.11938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11930v1","updated":"2025-01-21T07:08:53Z","published":"2025-01-21T07:08:53Z","title":"Nocturnal eye inspired liquid to gas phase change soft actuator with\n Laser-Induced-Graphene: enhanced environmental light harvesting and\n photothermal conversion","summary":" Robotic systems' mobility is constrained by power sources and wiring. While\npneumatic actuators remain tethered to air supplies, we developed a new\nactuator utilizing light energy. Inspired by nocturnal animals' eyes, we\ndesigned a bilayer soft actuator incorporating Laser-Induced Graphene (LIG) on\nthe inner surface of a silicone layer. This design maintains silicone's\ntransparency and flexibility while achieving 54% faster response time compared\nto conventional actuators through enhanced photothermal conversion.\n","authors":["Maina Sogabe","Youhyun Kim","Kenji Kawashima"],"pdf_url":"https://arxiv.org/pdf/2501.11930v1.pdf","comment":"23pages, 8 figures, journal paper"},{"id":"http://arxiv.org/abs/2310.20151v2","updated":"2025-01-21T06:26:43Z","published":"2023-10-31T03:37:11Z","title":"Multi-Agent Consensus Seeking via Large Language Models","summary":" Multi-agent systems driven by large language models (LLMs) have shown\npromising abilities for solving complex tasks in a collaborative manner. This\nwork considers a fundamental problem in multi-agent collaboration: consensus\nseeking. When multiple agents work together, we are interested in how they can\nreach a consensus through inter-agent negotiation. To that end, this work\nstudies a consensus-seeking task where the state of each agent is a numerical\nvalue and they negotiate with each other to reach a consensus value. It is\nrevealed that when not explicitly directed on which strategy should be adopted,\nthe LLM-driven agents primarily use the average strategy for consensus seeking\nalthough they may occasionally use some other strategies. Moreover, this work\nanalyzes the impact of the agent number, agent personality, and network\ntopology on the negotiation process. The findings reported in this work can\npotentially lay the foundations for understanding the behaviors of LLM-driven\nmulti-agent systems for solving more complex tasks. Furthermore, LLM-driven\nconsensus seeking is applied to a multi-robot aggregation task. This\napplication demonstrates the potential of LLM-driven agents to achieve\nzero-shot autonomous planning for multi-robot collaboration tasks. 
Project\nwebsite: windylab.github.io/ConsensusLLM/.\n","authors":["Huaben Chen","Wenkang Ji","Lufeng Xu","Shiyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.20151v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00204v5","updated":"2025-01-21T06:08:46Z","published":"2024-03-30T00:46:43Z","title":"AirPilot: Interpretable PPO-based DRL Auto-Tuned Nonlinear PID Drone\n Controller for Robust Autonomous Flights","summary":" Navigation precision, speed and stability are crucial for safe Unmanned\nAerial Vehicle (UAV) flight maneuvers and effective flight mission executions\nin dynamic environments. Different flight missions may have varying objectives,\nsuch as minimizing energy consumption, achieving precise positioning, or\nmaximizing speed. A controller that can adapt to different objectives on the\nfly is highly valuable. Proportional Integral Derivative (PID) controllers are\none of the most popular and widely used control algorithms for drones and other\ncontrol systems, but their linear control algorithm fails to capture the\nnonlinear nature of the dynamic wind conditions and complex drone system.\nManually tuning the PID gains for various missions can be time-consuming and\nrequires significant expertise. This paper aims to revolutionize drone flight\ncontrol by presenting the AirPilot, a nonlinear Deep Reinforcement Learning\n(DRL) - enhanced Proportional Integral Derivative (PID) drone controller using\nProximal Policy Optimization (PPO). AirPilot controller combines the simplicity\nand effectiveness of traditional PID control with the adaptability, learning\ncapability, and optimization potential of DRL. This makes it better suited for\nmodern drone applications where the environment is dynamic, and\nmission-specific performance demands are high. We employed a COEX Clover\nautonomous drone for training the DRL agent within the simulator and\nimplemented it in a real-world lab setting, which marks a significant milestone\nas one of the first attempts to apply a DRL-based flight controller on an\nactual drone. Airpilot is capable of reducing the navigation error of the\ndefault PX4 PID position controller by 90%, improving effective navigation\nspeed of a fine-tuned PID controller by 21%, reducing settling time and\novershoot by 17% and 16% respectively.\n","authors":["Junyang Zhang","Cristian Emanuel Ocampo Rivera","Kyle Tyni","Steven Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.00204v5.pdf","comment":"9 pages, 20 figures"},{"id":"http://arxiv.org/abs/2408.07855v3","updated":"2025-01-21T05:59:47Z","published":"2024-08-14T23:48:26Z","title":"Complementarity-Free Multi-Contact Modeling and Optimization for\n Dexterous Manipulation","summary":" A significant barrier preventing model-based methods from achieving real-time\nand versatile dexterous robotic manipulation is the inherent complexity of\nmulti-contact dynamics. Traditionally formulated as complementarity models,\nmulti-contact dynamics introduces non-smoothness and combinatorial complexity,\ncomplicating contact-rich planning and optimization. In this paper, we\ncircumvent these challenges by introducing a lightweight yet capable\nmulti-contact model. Our new model, derived from the duality of\noptimization-based contact models, dispenses with the complementarity\nconstructs entirely, providing computational advantages such as closed-form\ntime stepping, differentiability, automatic satisfaction with Coulomb friction\nlaw, and minimal hyperparameter tuning. 
We demonstrate the effectiveness and\nefficiency of the model for planning and control in a range of challenging\ndexterous manipulation tasks, including fingertip 3D in-air manipulation,\nTriFinger in-hand manipulation, and Allegro hand on-palm reorientation, all\nperformed with diverse objects. Our method consistently achieves\nstate-of-the-art results: (I) a 96.5% average success rate across all objects\nand tasks, (II) high manipulation accuracy with an average reorientation error\nof 11{\\deg} and position error of 7.8mm, and (III) contact-implicit model\npredictive control running at 50-100 Hz for all objects and tasks. These\nresults are achieved with minimal hyperparameter tuning.\n","authors":["Wanxin Jin"],"pdf_url":"https://arxiv.org/pdf/2408.07855v3.pdf","comment":"Video demo: https://youtu.be/NsL4hbSXvFg"},{"id":"http://arxiv.org/abs/2310.15846v4","updated":"2025-01-21T05:44:03Z","published":"2023-10-24T13:58:10Z","title":"Optimal Spatial-Temporal Triangulation for Bearing-Only Cooperative\n Motion Estimation","summary":" Vision-based cooperative motion estimation is an important problem for many\nmulti-robot systems such as cooperative aerial target pursuit. This problem can\nbe formulated as bearing-only cooperative motion estimation, where the visual\nmeasurement is modeled as a bearing vector pointing from the camera to the\ntarget. The conventional approaches for bearing-only cooperative estimation are\nmainly based on the framework distributed Kalman filtering (DKF). In this\npaper, we propose a new optimal bearing-only cooperative estimation algorithm,\nnamed spatial-temporal triangulation, based on the method of distributed\nrecursive least squares, which provides a more flexible framework for designing\ndistributed estimators than DKF. The design of the algorithm fully incorporates\nall the available information and the specific triangulation geometric\nconstraint. As a result, the algorithm has superior estimation performance than\nthe state-of-the-art DKF algorithms in terms of both accuracy and convergence\nspeed as verified by numerical simulation. We rigorously prove the exponential\nconvergence of the proposed algorithm. Moreover, to verify the effectiveness of\nthe proposed algorithm under practical challenging conditions, we develop a\nvision-based cooperative aerial target pursuit system, which is the first of\nsuch fully autonomous systems so far to the best of our knowledge.\n","authors":["Canlun Zheng","Yize Mi","Hanqing Guo","Huaben Chen","Zhiyun Lin","Shiyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.15846v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11893v1","updated":"2025-01-21T05:03:06Z","published":"2025-01-21T05:03:06Z","title":"DynoSAM: Open-Source Smoothing and Mapping Framework for Dynamic SLAM","summary":" Traditional Visual Simultaneous Localization and Mapping (vSLAM) systems\nfocus solely on static scene structures, overlooking dynamic elements in the\nenvironment. Although effective for accurate visual odometry in complex\nscenarios, these methods discard crucial information about moving objects. By\nincorporating this information into a Dynamic SLAM framework, the motion of\ndynamic entities can be estimated, enhancing navigation whilst ensuring\naccurate localization. However, the fundamental formulation of Dynamic SLAM\nremains an open challenge, with no consensus on the optimal approach for\naccurate motion estimation within a SLAM pipeline. 
Therefore, we developed\nDynoSAM, an open-source framework for Dynamic SLAM that enables the efficient\nimplementation, testing, and comparison of various Dynamic SLAM optimization\nformulations. DynoSAM integrates static and dynamic measurements into a unified\noptimization problem solved using factor graphs, simultaneously estimating\ncamera poses, static scene, object motion or poses, and object structures. We\nevaluate DynoSAM across diverse simulated and real-world datasets, achieving\nstate-of-the-art motion estimation in indoor and outdoor environments, with\nsubstantial improvements over existing systems. Additionally, we demonstrate\nDynoSAM utility in downstream applications, including 3D reconstruction of\ndynamic scenes and trajectory prediction, thereby showcasing potential for\nadvancing dynamic object-aware SLAM systems. DynoSAM is open-sourced at\nhttps://github.com/ACFR-RPG/DynOSAM.\n","authors":["Jesse Morris","Yiduo Wang","Mikolaj Kliniewski","Viorela Ila"],"pdf_url":"https://arxiv.org/pdf/2501.11893v1.pdf","comment":"20 pages, 10 figures. Submitted to T-RO Visual SLAM SI 2025"},{"id":"http://arxiv.org/abs/2501.11887v1","updated":"2025-01-21T04:53:17Z","published":"2025-01-21T04:53:17Z","title":"Connection-Coordination Rapport (CCR) Scale: A Dual-Factor Scale to\n Measure Human-Robot Rapport","summary":" Robots, particularly in service and companionship roles, must develop\npositive relationships with people they interact with regularly to be\nsuccessful. These positive human-robot relationships can be characterized as\nestablishing \"rapport,\" which indicates mutual understanding and interpersonal\nconnection that form the groundwork for successful long-term human-robot\ninteraction. However, the human-robot interaction research literature lacks\nscale instruments to assess human-robot rapport in a variety of situations. In\nthis work, we developed the 18-item Connection-Coordination Rapport (CCR) Scale\nto measure human-robot rapport. We first ran Study 1 (N = 288) where online\nparticipants rated videos of human-robot interactions using a set of candidate\nitems. Our Study 1 results showed the discovery of two factors in our scale,\nwhich we named \"Connection\" and \"Coordination.\" We then evaluated this scale by\nrunning Study 2 (N = 201) where online participants rated a new set of\nhuman-robot interaction videos with our scale and an existing rapport scale\nfrom virtual agents research for comparison. We also validated our scale by\nreplicating a prior in-person human-robot interaction study, Study 3 (N = 44),\nand found that rapport is rated significantly greater when participants\ninteracted with a responsive robot (responsive condition) as opposed to an\nunresponsive robot (unresponsive condition). Results from these studies\ndemonstrate high reliability and validity for the CCR scale, which can be used\nto measure rapport in both first-person and third-person perspectives. We\nencourage the adoption of this scale in future studies to measure rapport in a\nvariety of human-robot interactions.\n","authors":["Ting-Han Lin","Hannah Dinner","Tsz Long Leung","Bilge Mutlu","J. 
Gregory Trafton","Sarah Sebo"],"pdf_url":"https://arxiv.org/pdf/2501.11887v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.08907v3","updated":"2025-01-21T04:09:42Z","published":"2024-07-12T00:57:36Z","title":"Tightly-Coupled LiDAR-IMU-Wheel Odometry with an Online Neural Kinematic\n Model Learning via Factor Graph Optimization","summary":" Environments lacking geometric features (e.g., tunnels and long straight\ncorridors) are challenging for LiDAR-based odometry algorithms because LiDAR\npoint clouds degenerate in such environments. For wheeled robots, a wheel\nkinematic model (i.e., wheel odometry) can improve the reliability of the\nodometry estimation. However, the kinematic model suffers from complex motions\n(e.g., wheel slippage, lateral movement) in the case of skid-steering robots\nparticularly because this robot model rotates by skidding its wheels.\nFurthermore, these errors change nonlinearly when the wheel slippage is large\n(e.g., drifting) and are subject to terrain-dependent parameters. To\nsimultaneously tackle point cloud degeneration and the kinematic model errors,\nwe developed a LiDAR-IMU-wheel odometry algorithm incorporating online training\nof a neural network that learns the kinematic model of wheeled robots with\nnonlinearity. We propose to train the neural network online on a factor graph\nalong with robot states, allowing the learning-based kinematic model to adapt\nto the current terrain condition. The proposed method jointly solves online\ntraining of the neural network and LiDARIMUwheel odometry on a unified factor\ngraph to retain the consistency of all those constraints. Through experiments,\nwe first verified that the proposed network adapted to a changing environment,\nresulting in an accurate odometry estimation across different environments. We\nthen confirmed that the proposed odometry estimation algorithm was robust\nagainst point cloud degeneration and nonlinearity (e.g., large wheel slippage\nby drifting) of the kinematic model.\n","authors":["Taku Okawara","Kenji Koide","Shuji Oishi","Masashi Yokozuka","Atsuhiko Banno","Kentaro Uno","Kazuya Yoshida"],"pdf_url":"https://arxiv.org/pdf/2407.08907v3.pdf","comment":"https://youtu.be/CvRVhdda7Cw"},{"id":"http://arxiv.org/abs/2408.11051v2","updated":"2025-01-21T04:06:09Z","published":"2024-08-20T17:57:46Z","title":"FLAME: Learning to Navigate with Multimodal LLM in Urban Environments","summary":" Large Language Models (LLMs) have demonstrated potential in\nVision-and-Language Navigation (VLN) tasks, yet current applications face\nchallenges. While LLMs excel in general conversation scenarios, they struggle\nwith specialized navigation tasks, yielding suboptimal performance compared to\nspecialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied\nAgent), a novel Multimodal LLM-based agent and architecture designed for urban\nVLN tasks that efficiently handles multiple observations. Our approach\nimplements a three-phase tuning technique for effective adaptation to\nnavigation tasks, including single perception tuning for street view\ndescription, multiple perception tuning for route summarization, and end-to-end\ntraining on VLN datasets. The augmented datasets are synthesized automatically.\nExperimental results demonstrate FLAME's superiority over existing methods,\nsurpassing state-of-the-art methods by a 7.3% increase in task completion on\nTouchdown dataset. 
This work showcases the potential of Multimodal LLMs (MLLMs)\nin complex navigation tasks, representing an advancement towards applications\nof MLLMs in the field of embodied intelligence.\n","authors":["Yunzhe Xu","Yiyuan Pan","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11051v2.pdf","comment":"Accepted to AAAI 2025 (Oral)"},{"id":"http://arxiv.org/abs/2405.16960v2","updated":"2025-01-21T03:49:48Z","published":"2024-05-27T08:55:17Z","title":"DCPI-Depth: Explicitly Infusing Dense Correspondence Prior to\n Unsupervised Monocular Depth Estimation","summary":" There has been a recent surge of interest in learning to perceive depth from\nmonocular videos in an unsupervised fashion. A key challenge in this field is\nachieving robust and accurate depth estimation in challenging scenarios,\nparticularly in regions with weak textures or where dynamic objects are\npresent. This study makes three major contributions by delving deeply into\ndense correspondence priors to provide existing frameworks with explicit\ngeometric constraints. The first novelty is a contextual-geometric depth\nconsistency loss, which employs depth maps triangulated from dense\ncorrespondences based on estimated ego-motion to guide the learning of depth\nperception from contextual information, since explicitly triangulated depth\nmaps capture accurate relative distances among pixels. The second novelty\narises from the observation that there exists an explicit, deducible\nrelationship between optical flow divergence and depth gradient. A differential\nproperty correlation loss is, therefore, designed to refine depth estimation\nwith a specific emphasis on local variations. The third novelty is a\nbidirectional stream co-adjustment strategy that enhances the interaction\nbetween rigid and optical flows, encouraging the former towards more accurate\ncorrespondence and making the latter more adaptable across various scenarios\nunder the static scene hypotheses. DCPI-Depth, a framework that incorporates\nall these innovative components and couples two bidirectional and collaborative\nstreams, achieves state-of-the-art performance and generalizability across\nmultiple public datasets, outperforming all existing prior arts. Specifically,\nit demonstrates accurate depth estimation in texture-less and dynamic regions,\nand shows more reasonable smoothness. Our source code will be publicly\navailable at mias.group/DCPI-Depth upon publication.\n","authors":["Mengtan Zhang","Yi Feng","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2405.16960v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.18313v5","updated":"2025-01-21T02:38:32Z","published":"2024-09-26T21:44:11Z","title":"Embodied-RAG: General Non-parametric Embodied Memory for Retrieval and\n Generation","summary":" There is no limit to how much a robot might explore and learn, but all of\nthat knowledge needs to be searchable and actionable. Within language research,\nretrieval augmented generation (RAG) has become the workhorse of large-scale\nnon-parametric knowledge; however, existing techniques do not directly transfer\nto the embodied domain, which is multimodal, where data is highly correlated,\nand perception requires abstraction. 
To address these challenges, we introduce\nEmbodied-RAG, a framework that enhances the foundational model of an embodied\nagent with a non-parametric memory system capable of autonomously constructing\nhierarchical knowledge for both navigation and language generation.\nEmbodied-RAG handles a full range of spatial and semantic resolutions across\ndiverse environments and query types, whether for a specific object or a\nholistic description of ambiance. At its core, Embodied-RAG's memory is\nstructured as a semantic forest, storing language descriptions at varying\nlevels of detail. This hierarchical organization allows the system to\nefficiently generate context-sensitive outputs across different robotic\nplatforms. We demonstrate that Embodied-RAG effectively bridges RAG to the\nrobotics domain, successfully handling over 250 explanation and navigation\nqueries across kilometer-level environments, highlighting its promise as a\ngeneral-purpose non-parametric system for embodied agents.\n","authors":["Quanting Xie","So Yeon Min","Pengliang Ji","Yue Yang","Tianyi Zhang","Kedi Xu","Aarav Bajaj","Ruslan Salakhutdinov","Matthew Johnson-Roberson","Yonatan Bisk"],"pdf_url":"https://arxiv.org/pdf/2409.18313v5.pdf","comment":"Web: https://quanting-xie.github.io/Embodied-RAG-web/"},{"id":"http://arxiv.org/abs/2501.11803v1","updated":"2025-01-21T00:44:18Z","published":"2025-01-21T00:44:18Z","title":"Automating High Quality RT Planning at Scale","summary":" Radiotherapy (RT) planning is complex, subjective, and time-intensive.\nAdvances in artificial intelligence (AI) promise to improve its precision,\nefficiency, and consistency, but progress is often limited by the scarcity of\nlarge, standardized datasets. To address this, we introduce the Automated\nIterative RT Planning (AIRTP) system, a scalable solution for generating\nhigh-quality treatment plans. This scalable solution is designed to generate\nsubstantial volumes of consistently high-quality treatment plans, overcoming a\nkey obstacle in the advancement of AI-driven RT planning. Our AIRTP pipeline\nadheres to clinical guidelines and automates essential steps, including\norgan-at-risk (OAR) contouring, helper structure creation, beam setup,\noptimization, and plan quality improvement, using AI integrated with RT\nplanning software like Eclipse of Varian. Furthermore, a novel approach for\ndetermining optimization parameters to reproduce 3D dose distributions, i.e. a\nmethod to convert dose predictions to deliverable treatment plans constrained\nby machine limitations. A comparative analysis of plan quality reveals that our\nautomated pipeline produces treatment plans of quality comparable to those\ngenerated manually, which traditionally require several hours of labor per\nplan. Committed to public research, the first data release of our AIRTP\npipeline includes nine cohorts covering head-and-neck and lung cancer sites to\nsupport an AAPM 2025 challenge. This data set features more than 10 times the\nnumber of plans compared to the largest existing well-curated public data set\nto our best knowledge.\nRepo:{https://github.com/RiqiangGao/GDP-HMM_AAPMChallenge}\n","authors":["Riqiang Gao","Mamadou Diallo","Han Liu","Anthony Magliari","Jonathan Sackett","Wilko Verbakel","Sandra Meyers","Masoud Zarepisheh","Rafe Mcbeth","Simon Arberet","Martin Kraus","Florin C. 
Ghesu","Ali Kamen"],"pdf_url":"https://arxiv.org/pdf/2501.11803v1.pdf","comment":"Related to GDP-HMM grand challenge"},{"id":"http://arxiv.org/abs/2501.12536v1","updated":"2025-01-21T22:59:50Z","published":"2025-01-21T22:59:50Z","title":"Interaction Dataset of Autonomous Vehicles with Traffic Lights and Signs","summary":" This paper presents the development of a comprehensive dataset capturing\ninteractions between Autonomous Vehicles (AVs) and traffic control devices,\nspecifically traffic lights and stop signs. Derived from the Waymo Motion\ndataset, our work addresses a critical gap in the existing literature by\nproviding real-world trajectory data on how AVs navigate these traffic control\ndevices. We propose a methodology for identifying and extracting relevant\ninteraction trajectory data from the Waymo Motion dataset, incorporating over\n37,000 instances with traffic lights and 44,000 with stop signs. Our\nmethodology includes defining rules to identify various interaction types,\nextracting trajectory data, and applying a wavelet-based denoising method to\nsmooth the acceleration and speed profiles and eliminate anomalous values,\nthereby enhancing the trajectory quality. Quality assessment metrics indicate\nthat trajectories obtained in this study have anomaly proportions in\nacceleration and jerk profiles reduced to near-zero levels across all\ninteraction categories. By making this dataset publicly available, we aim to\naddress the current gap in datasets containing AV interaction behaviors with\ntraffic lights and signs. Based on the organized and published dataset, we can\ngain a more in-depth understanding of AVs' behavior when interacting with\ntraffic lights and signs. This will facilitate research on AV integration into\nexisting transportation infrastructures and networks, supporting the\ndevelopment of more accurate behavioral models and simulation tools.\n","authors":["Zheng Li","Zhipeng Bao","Haoming Meng","Haotian Shi","Qianwen Li","Handong Yao","Xiaopeng Li"],"pdf_url":"https://arxiv.org/pdf/2501.12536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11783v2","updated":"2025-01-21T21:46:26Z","published":"2024-10-15T17:02:32Z","title":"LatentBKI: Open-Dictionary Continuous Mapping in Visual-Language Latent\n Spaces with Quantifiable Uncertainty","summary":" This paper introduces a novel probabilistic mapping algorithm, LatentBKI,\nwhich enables open-vocabulary mapping with quantifiable uncertainty.\nTraditionally, semantic mapping algorithms focus on a fixed set of semantic\ncategories which limits their applicability for complex robotic tasks.\nVision-Language (VL) models have recently emerged as a technique to jointly\nmodel language and visual features in a latent space, enabling semantic\nrecognition beyond a predefined, fixed set of semantic classes. LatentBKI\nrecurrently incorporates neural embeddings from VL models into a voxel map with\nquantifiable uncertainty, leveraging the spatial correlations of nearby\nobservations through Bayesian Kernel Inference (BKI). LatentBKI is evaluated\nagainst similar explicit semantic mapping and VL mapping frameworks on the\npopular Matterport3D and Semantic KITTI datasets, demonstrating that LatentBKI\nmaintains the probabilistic benefits of continuous mapping with the additional\nbenefit of open-dictionary queries. 
Real-world experiments demonstrate\napplicability to challenging indoor environments.\n","authors":["Joey Wilson","Ruihan Xu","Yile Sun","Parker Ewen","Minghan Zhu","Kira Barton","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2410.11783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01737v2","updated":"2025-01-21T21:30:18Z","published":"2024-08-03T10:39:42Z","title":"Tightly Coupled SLAM with Imprecise Architectural Plans","summary":" Robots navigating indoor environments often have access to architectural\nplans, which can serve as prior knowledge to enhance their localization and\nmapping capabilities. While some SLAM algorithms leverage these plans for\nglobal localization in real-world environments, they typically overlook a\ncritical challenge: the \"as-planned\" architectural designs frequently deviate\nfrom the \"as-built\" real-world environments. To address this gap, we present a\nnovel algorithm that tightly couples LIDAR-based simultaneous localization and\nmapping with architectural plans under the presence of deviations. Our method\nutilizes a multi-layered semantic representation to not only localize the\nrobot, but also to estimate global alignment and structural deviations between\n\"as-planned\" and as-built environments in real-time. To validate our approach,\nwe performed experiments in simulated and real datasets demonstrating\nrobustness to structural deviations up to 35 cm and 15 degrees. On average, our\nmethod achieves 43% less localization error than baselines in simulated\nenvironments, while in real environments, the as-built 3D maps show 7% lower\naverage alignment error\n","authors":["Muhammad Shaheer","Jose Andres Millan-Romera","Hriday Bavle","Marco Giberna","Jose Luis Sanchez-Lopez","Javier Civera","Holger Voos"],"pdf_url":"https://arxiv.org/pdf/2408.01737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12493v1","updated":"2025-01-21T20:43:33Z","published":"2025-01-21T20:43:33Z","title":"ELEGNT: Expressive and Functional Movement Design for\n Non-anthropomorphic Robot","summary":" Nonverbal behaviors such as posture, gestures, and gaze are essential for\nconveying internal states, both consciously and unconsciously, in human\ninteraction. For robots to interact more naturally with humans, robot movement\ndesign should likewise integrate expressive qualities, such as intention,\nattention, and emotions, alongside traditional functional considerations like\ntask fulfillment and time efficiency. In this paper, we present the design and\nprototyping of a lamp-like robot that explores the interplay between functional\nand expressive objectives in movement design. Using a research-through-design\nmethodology, we document the hardware design process, define expressive\nmovement primitives, and outline a set of interaction scenario storyboards. We\npropose a framework that incorporates both functional and expressive utilities\nduring movement generation, and implement the robot behavior sequences in\ndifferent function- and social- oriented tasks. Through a user study comparing\nexpression-driven versus function-driven movements across six task scenarios,\nour findings indicate that expression-driven movements significantly enhance\nuser engagement and perceived robot qualities. 
This effect is especially\npronounced in social-oriented tasks.\n","authors":["Yuhan Hu","Peide Huang","Mouli Sivapurapu","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.12493v1.pdf","comment":"13 pages, manuscript under review"},{"id":"http://arxiv.org/abs/2501.12482v1","updated":"2025-01-21T20:20:34Z","published":"2025-01-21T20:20:34Z","title":"TOFFE -- Temporally-binned Object Flow from Events for High-speed and\n Energy-Efficient Object Detection and Tracking","summary":" Object detection and tracking is an essential perception task for enabling\nfully autonomous navigation in robotic systems. Edge robot systems such as\nsmall drones need to execute complex maneuvers at high-speeds with limited\nresources, which places strict constraints on the underlying algorithms and\nhardware. Traditionally, frame-based cameras are used for vision-based\nperception due to their rich spatial information and simplified synchronous\nsensing capabilities. However, obtaining detailed information across frames\nincurs high energy consumption and may not even be required. In addition, their\nlow temporal resolution renders them ineffective in high-speed motion\nscenarios. Event-based cameras offer a biologically-inspired solution to this\nby capturing only changes in intensity levels at exceptionally high temporal\nresolution and low power consumption, making them ideal for high-speed motion\nscenarios. However, their asynchronous and sparse outputs are not natively\nsuitable with conventional deep learning methods. In this work, we propose\nTOFFE, a lightweight hybrid framework for performing event-based object motion\nestimation (including pose, direction, and speed estimation), referred to as\nObject Flow. TOFFE integrates bio-inspired Spiking Neural Networks (SNNs) and\nconventional Analog Neural Networks (ANNs), to efficiently process events at\nhigh temporal resolutions while being simple to train. Additionally, we present\na novel event-based synthetic dataset involving high-speed object motion to\ntrain TOFFE. Our experimental results show that TOFFE achieves 5.7x/8.3x\nreduction in energy consumption and 4.6x/5.8x reduction in latency on edge\nGPU(Jetson TX2)/hybrid hardware(Loihi-2 and Jetson TX2), compared to previous\nevent-based object detection baselines.\n","authors":["Adarsh Kumar Kosta","Amogh Joshi","Arjun Roy","Rohan Kumar Manna","Manish Nagaraj","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2501.12482v1.pdf","comment":"8 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.13960v1","updated":"2025-01-21T07:57:07Z","published":"2025-01-21T07:57:07Z","title":"LiCAR: pseudo-RGB LiDAR image for CAR segmentation","summary":" With the advancement of computing resources, an increasing number of Neural\nNetworks (NNs) are appearing for image detection and segmentation appear.\nHowever, these methods usually accept as input a RGB 2D image. On the other\nside, Light Detection And Ranging (LiDAR) sensors with many layers provide\nimages that are similar to those obtained from a traditional low resolution RGB\ncamera. Following this principle, a new dataset for segmenting cars in\npseudo-RGB images has been generated. This dataset combines the information\ngiven by the LiDAR sensor into a Spherical Range Image (SRI), concretely the\nreflectivity, near infrared and signal intensity 2D images. These images are\nthen fed into instance segmentation NNs. 
These NNs segment the cars that appear\nin these images, having as result a Bounding Box (BB) and mask precision of 88%\nand 81.5% respectively with You Only Look Once (YOLO)-v8 large. By using this\nsegmentation NN, some trackers have been applied so as to follow each car\nsegmented instance along a video feed, having great performance in real world\nexperiments.\n","authors":["Ignacio de Loyola Páez-Ubieta","Edison P. Velasco-Sánchez","Santiago T. Puente"],"pdf_url":"https://arxiv.org/pdf/2501.13960v1.pdf","comment":"This is a preprint version of the work accepted at 5th International\n Conference on Robotics, Computer Vision and Intelligent Systems (ROBOVIS\n 2025)"},{"id":"http://arxiv.org/abs/2501.14824v1","updated":"2025-01-21T16:37:17Z","published":"2025-01-21T16:37:17Z","title":"A causal learning approach to in-orbit inertial parameter estimation for\n multi-payload deployers","summary":" This paper discusses an approach to inertial parameter estimation for the\ncase of cargo carrying spacecraft that is based on causal learning, i.e.\nlearning from the responses of the spacecraft, under actuation. Different\nspacecraft configurations (inertial parameter sets) are simulated under\ndifferent actuation profiles, in order to produce an optimised time-series\nclustering classifier that can be used to distinguish between them. The\nactuation is comprised of finite sequences of constant inputs that are applied\nin order, based on typical actuators available. By learning from the system's\nresponses across multiple input sequences, and then applying measures of\ntime-series similarity and F1-score, an optimal actuation sequence can be\nchosen either for one specific system configuration or for the overall set of\npossible configurations. This allows for both estimation of the inertial\nparameter set without any prior knowledge of state, as well as validation of\ntransitions between different configurations after a deployment event. The\noptimisation of the actuation sequence is handled by a reinforcement learning\nmodel that uses the proximal policy optimisation (PPO) algorithm, by repeatedly\ntrying different sequences and evaluating the impact on classifier performance\naccording to a multi-objective metric.\n","authors":["Konstantinos Platanitis","Miguel Arana-Catania","Saurabh Upadhyay","Leonard Felicetti"],"pdf_url":"https://arxiv.org/pdf/2501.14824v1.pdf","comment":"10 pages, 18 figures, 1 table. Presented in 75th International\n Astronautical Congress (IAC), Milan, Italy, 14-18 October 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.12393v1","updated":"2025-01-21T18:59:59Z","published":"2025-01-21T18:59:59Z","title":"Towards Affordance-Aware Articulation Synthesis for Rigged Objects","summary":" Rigged objects are commonly used in artist pipelines, as they can flexibly\nadapt to different scenes and postures. However, articulating the rigs into\nrealistic affordance-aware postures (e.g., following the context, respecting\nthe physics and the personalities of the object) remains time-consuming and\nheavily relies on human labor from experienced artists. In this paper, we\ntackle the novel problem and design A3Syn. With a given context, such as the\nenvironment mesh and a text prompt of the desired posture, A3Syn synthesizes\narticulation parameters for arbitrary and open-domain rigged objects obtained\nfrom the Internet. 
The task is incredibly challenging due to the lack of\ntraining data, and we do not make any topological assumptions about the\nopen-domain rigs. We propose using 2D inpainting diffusion model and several\ncontrol techniques to synthesize in-context affordance information. Then, we\ndevelop an efficient bone correspondence alignment using a combination of\ndifferentiable rendering and semantic correspondence. A3Syn has stable\nconvergence, completes in minutes, and synthesizes plausible affordance on\ndifferent combinations of in-the-wild object rigs and scenes.\n","authors":["Yu-Chu Yu","Chieh Hubert Lin","Hsin-Ying Lee","Chaoyang Wang","Yu-Chiang Frank Wang","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2501.12393v1.pdf","comment":"Project page: https://chuyu.org/research/a3syn"},{"id":"http://arxiv.org/abs/2501.12392v1","updated":"2025-01-21T18:59:53Z","published":"2025-01-21T18:59:53Z","title":"Learning segmentation from point trajectories","summary":" We consider the problem of segmenting objects in videos based on their motion\nand no other forms of supervision. Prior work has often approached this problem\nby using the principle of common fate, namely the fact that the motion of\npoints that belong to the same object is strongly correlated. However, most\nauthors have only considered instantaneous motion from optical flow. In this\nwork, we present a way to train a segmentation network using long-term point\ntrajectories as a supervisory signal to complement optical flow. The key\ndifficulty is that long-term motion, unlike instantaneous motion, is difficult\nto model -- any parametric approximation is unlikely to capture complex motion\npatterns over long periods of time. We instead draw inspiration from subspace\nclustering approaches, proposing a loss function that seeks to group the\ntrajectories into low-rank matrices where the motion of object points can be\napproximately explained as a linear combination of other point tracks. Our\nmethod outperforms the prior art on motion-based segmentation, which shows the\nutility of long-term motion and the effectiveness of our formulation.\n","authors":["Laurynas Karazija","Iro Laina","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2501.12392v1.pdf","comment":"NeurIPS 2024 Spotlight. Project\n https://www.robots.ox.ac.uk/~vgg/research/lrtl/"},{"id":"http://arxiv.org/abs/2501.12390v1","updated":"2025-01-21T18:59:46Z","published":"2025-01-21T18:59:46Z","title":"GPS as a Control Signal for Image Generation","summary":" We show that the GPS tags contained in photo metadata provide a useful\ncontrol signal for image generation. We train GPS-to-image models and use them\nfor tasks that require a fine-grained understanding of how images vary within a\ncity. In particular, we train a diffusion model to generate images conditioned\non both GPS and text. The learned model generates images that capture the\ndistinctive appearance of different neighborhoods, parks, and landmarks. We\nalso extract 3D models from 2D GPS-to-image models through score distillation\nsampling, using GPS conditioning to constrain the appearance of the\nreconstruction from each viewpoint. Our evaluations suggest that our\nGPS-conditioned models successfully learn to generate images that vary based on\nlocation, and that GPS conditioning improves estimated 3D structure.\n","authors":["Chao Feng","Ziyang Chen","Aleksander Holynski","Alexei A. 
Efros","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2501.12390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12389v1","updated":"2025-01-21T18:59:31Z","published":"2025-01-21T18:59:31Z","title":"Taming Teacher Forcing for Masked Autoregressive Video Generation","summary":" We introduce MAGI, a hybrid video generation framework that combines masked\nmodeling for intra-frame generation with causal modeling for next-frame\ngeneration. Our key innovation, Complete Teacher Forcing (CTF), conditions\nmasked frames on complete observation frames rather than masked ones (namely\nMasked Teacher Forcing, MTF), enabling a smooth transition from token-level\n(patch-level) to frame-level autoregressive generation. CTF significantly\noutperforms MTF, achieving a +23% improvement in FVD scores on first-frame\nconditioned video prediction. To address issues like exposure bias, we employ\ntargeted training strategies, setting a new benchmark in autoregressive video\ngeneration. Experiments show that MAGI can generate long, coherent video\nsequences exceeding 100 frames, even when trained on as few as 16 frames,\nhighlighting its potential for scalable, high-quality video generation.\n","authors":["Deyu Zhou","Quan Sun","Yuang Peng","Kun Yan","Runpei Dong","Duomin Wang","Zheng Ge","Nan Duan","Xiangyu Zhang","Lionel M. Ni","Heung-Yeung Shum"],"pdf_url":"https://arxiv.org/pdf/2501.12389v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.12387v1","updated":"2025-01-21T18:59:23Z","published":"2025-01-21T18:59:23Z","title":"Continuous 3D Perception Model with Persistent State","summary":" We present a unified framework capable of solving a broad range of 3D tasks.\nOur approach features a stateful recurrent model that continuously updates its\nstate representation with each new observation. Given a stream of images, this\nevolving state can be used to generate metric-scale pointmaps (per-pixel 3D\npoints) for each new input in an online fashion. These pointmaps reside within\na common coordinate system, and can be accumulated into a coherent, dense scene\nreconstruction that updates as new images arrive. Our model, called CUT3R\n(Continuous Updating Transformer for 3D Reconstruction), captures rich priors\nof real-world scenes: not only can it predict accurate pointmaps from image\nobservations, but it can also infer unseen regions of the scene by probing at\nvirtual, unobserved views. Our method is simple yet highly flexible, naturally\naccepting varying lengths of images that may be either video streams or\nunordered photo collections, containing both static and dynamic content. We\nevaluate our method on various 3D/4D tasks and demonstrate competitive or\nstate-of-the-art performance in each. Project Page: https://cut3r.github.io/\n","authors":["Qianqian Wang","Yifei Zhang","Aleksander Holynski","Alexei A. Efros","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2501.12387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12386v1","updated":"2025-01-21T18:59:00Z","published":"2025-01-21T18:59:00Z","title":"InternVideo2.5: Empowering Video MLLMs with Long and Rich Context\n Modeling","summary":" This paper aims to improve the performance of video multimodal large language\nmodels (MLLM) via long and rich context (LRC) modeling. As a result, we develop\na new version of InternVideo2.5 with a focus on enhancing the original MLLMs'\nability to perceive fine-grained details and capture long-form temporal\nstructure in videos. 
Specifically, our approach incorporates dense vision task\nannotations into MLLMs using direct preference optimization and develops\ncompact spatiotemporal representations through adaptive hierarchical token\ncompression. Experimental results demonstrate this unique design of LRC greatly\nimproves the results of video MLLM in mainstream video understanding benchmarks\n(short & long), enabling the MLLM to memorize significantly longer video inputs\n(at least 6x longer than the original), and master specialized vision\ncapabilities like object tracking and segmentation. Our work highlights the\nimportance of multimodal context richness (length and fineness) in empowering\nMLLM's innate abilites (focus and memory), providing new insights for future\nresearch on video MLLM. Code and models are available at\nhttps://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2.5\n","authors":["Yi Wang","Xinhao Li","Ziang Yan","Yinan He","Jiashuo Yu","Xiangyu Zeng","Chenting Wang","Changlian Ma","Haian Huang","Jianfei Gao","Min Dou","Kai Chen","Wenhai Wang","Yu Qiao","Yali Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.12386v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2501.12384v1","updated":"2025-01-21T18:57:34Z","published":"2025-01-21T18:57:34Z","title":"CCESAR: Coastline Classification-Extraction From SAR Images Using\n CNN-U-Net Combination","summary":" In this article, we improve the deep learning solution for coastline\nextraction from Synthetic Aperture Radar (SAR) images by proposing a two-stage\nmodel involving image classification followed by segmentation. We hypothesize\nthat a single segmentation model usually used for coastline detection is\ninsufficient to characterize different coastline types. We demonstrate that the\nneed for a two-stage workflow prevails through different compression levels of\nthese images. Our results from experiments using a combination of CNN and U-Net\nmodels on Sentinel-1 images show that the two-stage workflow, coastline\nclassification-extraction from SAR images (CCESAR) outperforms a single U-Net\nsegmentation model.\n","authors":["Vidhu Arora","Shreyan Gupta","Ananthakrishna Kudupu","Aditya Priyadarshi","Aswathi Mundayatt","Jaya Sreevalsan-Nair"],"pdf_url":"https://arxiv.org/pdf/2501.12384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12382v1","updated":"2025-01-21T18:56:41Z","published":"2025-01-21T18:56:41Z","title":"DiffDoctor: Diagnosing Image Diffusion Models Before Treating","summary":" In spite of the recent progress, image diffusion models still produce\nartifacts. A common solution is to refine an established model with a quality\nassessment system, which generally rates an image in its entirety. In this\nwork, we believe problem-solving starts with identification, yielding the\nrequest that the model should be aware of not just the presence of defects in\nan image, but their specific locations. Motivated by this, we propose\nDiffDoctor, a two-stage pipeline to assist image diffusion models in generating\nfewer artifacts. Concretely, the first stage targets developing a robust\nartifact detector, for which we collect a dataset of over 1M flawed synthesized\nimages and set up an efficient human-in-the-loop annotation process,\nincorporating a carefully designed class-balance strategy. The learned artifact\ndetector is then involved in the second stage to tune the diffusion model\nthrough assigning a per-pixel confidence map for each synthesis. 
Extensive\nexperiments on text-to-image diffusion models demonstrate the effectiveness of\nour artifact detector as well as the soundness of our diagnose-then-treat\ndesign.\n","authors":["Yiyang Wang","Xi Chen","Xiaogang Xu","Sihui Ji","Yu Liu","Yujun Shen","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.12382v1.pdf","comment":"8 pages of main body and 2 pages of references, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.12381v1","updated":"2025-01-21T18:56:19Z","published":"2025-01-21T18:56:19Z","title":"Parallel Sequence Modeling via Generalized Spatial Propagation Network","summary":" We present the Generalized Spatial Propagation Network (GSPN), a new\nattention mechanism optimized for vision tasks that inherently captures 2D\nspatial structures. Existing attention models, including transformers, linear\nattention, and state-space models like Mamba, process multi-dimensional data as\n1D sequences, compromising spatial coherence and efficiency. GSPN overcomes\nthese limitations by directly operating on spatially coherent image data and\nforming dense pairwise connections through a line-scan approach. Central to\nGSPN is the Stability-Context Condition, which ensures stable, context-aware\npropagation across 2D sequences and reduces the effective sequence length to\n$\\sqrt{N}$ for a square map with N elements, significantly enhancing\ncomputational efficiency. With learnable, input-dependent weights and no\nreliance on positional embeddings, GSPN achieves superior spatial fidelity and\nstate-of-the-art performance in vision tasks, including ImageNet\nclassification, class-guided image generation, and text-to-image generation.\nNotably, GSPN accelerates SD-XL with softmax-attention by over $84\\times$ when\ngenerating 16K images.\n","authors":["Hongjun Wang","Wonmin Byeon","Jiarui Xu","Jinwei Gu","Ka Chun Cheung","Xiaolong Wang","Kai Han","Jan Kautz","Sifei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.12381v1.pdf","comment":"Project page: http://whj363636.github.io/GSPN/"},{"id":"http://arxiv.org/abs/2501.12380v1","updated":"2025-01-21T18:56:18Z","published":"2025-01-21T18:56:18Z","title":"MMVU: Measuring Expert-Level Multi-Discipline Video Understanding","summary":" We introduce MMVU, a comprehensive expert-level, multi-discipline benchmark\nfor evaluating foundation models in video understanding. MMVU includes 3,000\nexpert-annotated questions spanning 27 subjects across four core disciplines:\nScience, Healthcare, Humanities & Social Sciences, and Engineering. Compared to\nprior benchmarks, MMVU features three key advancements. First, it challenges\nmodels to apply domain-specific knowledge and perform expert-level reasoning to\nanalyze specialized-domain videos, moving beyond the basic visual perception\ntypically assessed in current video benchmarks. Second, each example is\nannotated by human experts from scratch. We implement strict data quality\ncontrols to ensure the high quality of the dataset. Finally, each example is\nenriched with expert-annotated reasoning rationals and relevant domain\nknowledge, facilitating in-depth analysis. We conduct an extensive evaluation\nof 32 frontier multimodal foundation models on MMVU. The latest\nSystem-2-capable models, o1 and Gemini 2.0 Flash Thinking, achieve the highest\nperformance among the tested models. However, they still fall short of matching\nhuman expertise. 
Through in-depth error analyses and case studies, we offer\nactionable insights for future advancements in expert-level,\nknowledge-intensive video understanding for specialized domains.\n","authors":["Yilun Zhao","Lujing Xie","Haowei Zhang","Guo Gan","Yitao Long","Zhiyuan Hu","Tongyan Hu","Weiyuan Chen","Chuhan Li","Junyang Song","Zhijian Xu","Chengye Wang","Weifeng Pan","Ziyao Shangguan","Xiangru Tang","Zhenwen Liang","Yixin Liu","Chen Zhao","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2501.12380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12375v1","updated":"2025-01-21T18:53:30Z","published":"2025-01-21T18:53:30Z","title":"Video Depth Anything: Consistent Depth Estimation for Super-Long Videos","summary":" Depth Anything has achieved remarkable success in monocular depth estimation\nwith strong generalization ability. However, it suffers from temporal\ninconsistency in videos, hindering its practical applications. Various methods\nhave been proposed to alleviate this issue by leveraging video generation\nmodels or introducing priors from optical flow and camera poses. Nonetheless,\nthese methods are only applicable to short videos (< 10 seconds) and require a\ntrade-off between quality and computational efficiency. We propose Video Depth\nAnything for high-quality, consistent depth estimation in super-long videos\n(over several minutes) without sacrificing efficiency. We base our model on\nDepth Anything V2 and replace its head with an efficient spatial-temporal head.\nWe design a straightforward yet effective temporal consistency loss by\nconstraining the temporal depth gradient, eliminating the need for additional\ngeometric priors. The model is trained on a joint dataset of video depth and\nunlabeled images, similar to Depth Anything V2. Moreover, a novel\nkey-frame-based strategy is developed for long video inference. Experiments\nshow that our model can be applied to arbitrarily long videos without\ncompromising quality, consistency, or generalization ability. Comprehensive\nevaluations on multiple video benchmarks demonstrate that our approach sets a\nnew state-of-the-art in zero-shot video depth estimation. We offer models of\ndifferent scales to support a range of scenarios, with our smallest model\ncapable of real-time performance at 30 FPS.\n","authors":["Sili Chen","Hengkai Guo","Shengnan Zhu","Feihu Zhang","Zilong Huang","Jiashi Feng","Bingyi Kang"],"pdf_url":"https://arxiv.org/pdf/2501.12375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12369v1","updated":"2025-01-21T18:49:06Z","published":"2025-01-21T18:49:06Z","title":"DARB-Splatting: Generalizing Splatting with Decaying Anisotropic Radial\n Basis Functions","summary":" Splatting-based 3D reconstruction methods have gained popularity with the\nadvent of 3D Gaussian Splatting, efficiently synthesizing high-quality novel\nviews. These methods commonly resort to using exponential family functions,\nsuch as the Gaussian function, as reconstruction kernels due to their\nanisotropic nature, ease of projection, and differentiability in rasterization.\nHowever, the field remains restricted to variations within the exponential\nfamily, leaving generalized reconstruction kernels largely underexplored,\npartly due to the lack of easy integrability in 3D to 2D projections. 
In this\nlight, we show that a class of decaying anisotropic radial basis functions\n(DARBFs), which are non-negative functions of the Mahalanobis distance,\nsupports splatting by approximating the Gaussian function's closed-form\nintegration advantage. With this fresh perspective, we demonstrate up to 34%\nfaster convergence during training and a 15% reduction in memory consumption\nacross various DARB reconstruction kernels, while maintaining comparable PSNR,\nSSIM, and LPIPS results. We will make the code available.\n","authors":["Vishagar Arunan","Saeedha Nazar","Hashiru Pramuditha","Vinasirajan Viruthshaan","Sameera Ramasinghe","Simon Lucey","Ranga Rodrigo"],"pdf_url":"https://arxiv.org/pdf/2501.12369v1.pdf","comment":"Link to the project page:\n https://randomnerds.github.io/darbs.github.io/"},{"id":"http://arxiv.org/abs/2501.12368v1","updated":"2025-01-21T18:47:32Z","published":"2025-01-21T18:47:32Z","title":"InternLM-XComposer2.5-Reward: A Simple Yet Effective Multi-Modal Reward\n Model","summary":" Despite the promising performance of Large Vision Language Models (LVLMs) in\nvisual understanding, they occasionally generate incorrect outputs. While\nreward models (RMs) with reinforcement learning or test-time scaling offer the\npotential for improving generation quality, a critical gap remains: publicly\navailable multi-modal RMs for LVLMs are scarce, and the implementation details\nof proprietary models are often unclear. We bridge this gap with\nInternLM-XComposer2.5-Reward (IXC-2.5-Reward), a simple yet effective\nmulti-modal reward model that aligns LVLMs with human preferences. To ensure\nthe robustness and versatility of IXC-2.5-Reward, we set up a high-quality\nmulti-modal preference corpus spanning text, image, and video inputs across\ndiverse domains, such as instruction following, general understanding,\ntext-rich documents, mathematical reasoning, and video understanding.\nIXC-2.5-Reward achieves excellent results on the latest multi-modal reward\nmodel benchmark and shows competitive performance on text-only reward model\nbenchmarks. We further demonstrate three key applications of IXC-2.5-Reward:\n(1) Providing a supervisory signal for RL training. We integrate IXC-2.5-Reward\nwith Proximal Policy Optimization (PPO) yields IXC-2.5-Chat, which shows\nconsistent improvements in instruction following and multi-modal open-ended\ndialogue; (2) Selecting the best response from candidate responses for\ntest-time scaling; and (3) Filtering outlier or noisy samples from existing\nimage and video instruction tuning training data. To ensure reproducibility and\nfacilitate further research, we have open-sourced all model weights and\ntraining recipes at https://github.com/InternLM/InternLM-XComposer\n","authors":["Yuhang Zang","Xiaoyi Dong","Pan Zhang","Yuhang Cao","Ziyu Liu","Shengyuan Ding","Shenxi Wu","Yubo Ma","Haodong Duan","Wenwei Zhang","Kai Chen","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.12368v1.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2501.09898v2","updated":"2025-01-21T18:46:52Z","published":"2025-01-17T01:01:44Z","title":"FoundationStereo: Zero-Shot Stereo Matching","summary":" Tremendous progress has been made in deep stereo matching to excel on\nbenchmark datasets through per-domain fine-tuning. However, achieving strong\nzero-shot generalization - a hallmark of foundation models in other computer\nvision tasks - remains challenging for stereo matching. 
We introduce\nFoundationStereo, a foundation model for stereo depth estimation designed to\nachieve strong zero-shot generalization. To this end, we first construct a\nlarge-scale (1M stereo pairs) synthetic training dataset featuring large\ndiversity and high photorealism, followed by an automatic self-curation\npipeline to remove ambiguous samples. We then design a number of network\narchitecture components to enhance scalability, including a side-tuning feature\nbackbone that adapts rich monocular priors from vision foundation models to\nmitigate the sim-to-real gap, and long-range context reasoning for effective\ncost volume filtering. Together, these components lead to strong robustness and\naccuracy across domains, establishing a new standard in zero-shot stereo depth\nestimation. Project page: https://nvlabs.github.io/FoundationStereo/\n","authors":["Bowen Wen","Matthew Trepte","Joseph Aribido","Jan Kautz","Orazio Gallo","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2501.09898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12356v1","updated":"2025-01-21T18:36:18Z","published":"2025-01-21T18:36:18Z","title":"Vision-Language Models for Automated Chest X-ray Interpretation:\n Leveraging ViT and GPT-2","summary":" Radiology plays a pivotal role in modern medicine due to its non-invasive\ndiagnostic capabilities. However, the manual generation of unstructured medical\nreports is time consuming and prone to errors. It creates a significant\nbottleneck in clinical workflows. Despite advancements in AI-generated\nradiology reports, challenges remain in achieving detailed and accurate report\ngeneration. In this study we have evaluated different combinations of\nmultimodal models that integrate Computer Vision and Natural Language\nProcessing to generate comprehensive radiology reports. We employed a\npretrained Vision Transformer (ViT-B16) and a SWIN Transformer as the image\nencoders. The BART and GPT-2 models serve as the textual decoders. We used\nChest X-ray images and reports from the IU-Xray dataset to evaluate the\nusability of the SWIN Transformer-BART, SWIN Transformer-GPT-2, ViT-B16-BART\nand ViT-B16-GPT-2 models for report generation. We aimed at finding the best\ncombination among the models. The SWIN-BART model performs as the\nbest-performing model among the four models achieving remarkable results in\nalmost all the evaluation metrics like ROUGE, BLEU and BERTScore.\n","authors":["Md. Rakibul Islam","Md. Zahid Hossain","Mustofa Ahmed","Most. Sharmin Sultana Samu"],"pdf_url":"https://arxiv.org/pdf/2501.12356v1.pdf","comment":"Preprint, manuscript under-review"},{"id":"http://arxiv.org/abs/2501.12331v1","updated":"2025-01-21T18:05:11Z","published":"2025-01-21T18:05:11Z","title":"Cinepro: Robust Training of Foundation Models for Cancer Detection in\n Prostate Ultrasound Cineloops","summary":" Prostate cancer (PCa) detection using deep learning (DL) models has shown\npotential for enhancing real-time guidance during biopsies. However, prostate\nultrasound images lack pixel-level cancer annotations, introducing label noise.\nCurrent approaches often focus on limited regions of interest (ROIs),\ndisregarding anatomical context necessary for accurate diagnosis. Foundation\nmodels can overcome this limitation by analyzing entire images to capture\nglobal spatial relationships; however, they still encounter challenges stemming\nfrom the weak labels associated with coarse pathology annotations in ultrasound\ndata. 
We introduce Cinepro, a novel framework that strengthens foundation\nmodels' ability to localize PCa in ultrasound cineloops. Cinepro adapts robust\ntraining by integrating the proportion of cancer tissue reported by pathology\nin a biopsy core into its loss function to address label noise, providing a\nmore nuanced supervision. Additionally, it leverages temporal data across\nmultiple frames to apply robust augmentations, enhancing the model's ability to\nlearn stable cancer-related features. Cinepro demonstrates superior performance\non a multi-center prostate ultrasound dataset, achieving an AUROC of 77.1% and\na balanced accuracy of 83.8%, surpassing current benchmarks. These findings\nunderscore Cinepro's promise in advancing foundation models for weakly labeled\nultrasound data.\n","authors":["Mohamed Harmanani","Amoon Jamzad","Minh Nguyen Nhat To","Paul F. R. Wilson","Zhuoxin Guo","Fahimeh Fooladgar","Samira Sojoudi","Mahdi Gilany","Silvia Chang","Peter Black","Michael Leveridge","Robert Siemens","Purang Abolmaesumi","Parvin Mousavi"],"pdf_url":"https://arxiv.org/pdf/2501.12331v1.pdf","comment":"accepted to IEEE ISBI 2025"},{"id":"http://arxiv.org/abs/2501.12327v1","updated":"2025-01-21T17:50:43Z","published":"2025-01-21T17:50:43Z","title":"VARGPT: Unified Understanding and Generation in a Visual Autoregressive\n Multimodal Large Language Model","summary":" We present VARGPT, a novel multimodal large language model (MLLM) that\nunifies visual understanding and generation within a single autoregressive\nframework. VARGPT employs a next-token prediction paradigm for visual\nunderstanding and a next-scale prediction paradigm for visual autoregressive\ngeneration. VARGPT innovatively extends the LLaVA architecture, achieving\nefficient scale-wise autoregressive visual generation within MLLMs while\nseamlessly accommodating mixed-modal input and output within a single model\nframework. Our VARGPT undergoes a three-stage unified training process on\nspecially curated datasets, comprising a pre-training phase and two mixed\nvisual instruction-tuning phases. The unified training strategy are designed to\nachieve alignment between visual and textual features, enhance instruction\nfollowing for both understanding and generation, and improve visual generation\nquality, respectively. Despite its LLAVA-based architecture for multimodel\nunderstanding, VARGPT significantly outperforms LLaVA-1.5 across various\nvision-centric benchmarks, such as visual question-answering and reasoning\ntasks. Notably, VARGPT naturally supports capabilities in autoregressive visual\ngeneration and instruction-to-image synthesis, showcasing its versatility in\nboth visual understanding and generation tasks. Project page is at:\n\\url{https://vargpt-1.github.io/}\n","authors":["Xianwei Zhuang","Yuxin Xie","Yufan Deng","Liming Liang","Jinghan Ru","Yuguo Yin","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2501.12327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12326v1","updated":"2025-01-21T17:48:10Z","published":"2025-01-21T17:48:10Z","title":"UI-TARS: Pioneering Automated GUI Interaction with Native Agents","summary":" This paper introduces UI-TARS, a native GUI agent model that solely perceives\nthe screenshots as input and performs human-like interactions (e.g., keyboard\nand mouse operations). 
Unlike prevailing agent frameworks that depend on\nheavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts\nand workflows, UI-TARS is an end-to-end model that outperforms these\nsophisticated frameworks. Experiments demonstrate its superior performance:\nUI-TARS achieves SOTA performance in 10+ GUI agent benchmarks evaluating\nperception, grounding, and GUI task execution. Notably, in the OSWorld\nbenchmark, UI-TARS achieves scores of 24.6 with 50 steps and 22.7 with 15\nsteps, outperforming Claude (22.0 and 14.9 respectively). In AndroidWorld,\nUI-TARS achieves 46.6, surpassing GPT-4o (34.5). UI-TARS incorporates several\nkey innovations: (1) Enhanced Perception: leveraging a large-scale dataset of\nGUI screenshots for context-aware understanding of UI elements and precise\ncaptioning; (2) Unified Action Modeling, which standardizes actions into a\nunified space across platforms and achieves precise grounding and interaction\nthrough large-scale action traces; (3) System-2 Reasoning, which incorporates\ndeliberate reasoning into multi-step decision making, involving multiple\nreasoning patterns such as task decomposition, reflection thinking, milestone\nrecognition, etc. (4) Iterative Training with Reflective Online Traces, which\naddresses the data bottleneck by automatically collecting, filtering, and\nreflectively refining new interaction traces on hundreds of virtual machines.\nThrough iterative training and reflection tuning, UI-TARS continuously learns\nfrom its mistakes and adapts to unforeseen situations with minimal human\nintervention. We also analyze the evolution path of GUI agents to guide the\nfurther development of this domain.\n","authors":["Yujia Qin","Yining Ye","Junjie Fang","Haoming Wang","Shihao Liang","Shizuo Tian","Junda Zhang","Jiahao Li","Yunxin Li","Shijue Huang","Wanjun Zhong","Kuanye Li","Jiale Yang","Yu Miao","Woyu Lin","Longxiang Liu","Xu Jiang","Qianli Ma","Jingyu Li","Xiaojun Xiao","Kai Cai","Chuang Li","Yaowei Zheng","Chaolin Jin","Chen Li","Xiao Zhou","Minchao Wang","Haoli Chen","Zhaojian Li","Haihua Yang","Haifeng Liu","Feng Lin","Tao Peng","Xin Liu","Guang Shi"],"pdf_url":"https://arxiv.org/pdf/2501.12326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16766v2","updated":"2025-01-21T17:47:28Z","published":"2024-09-25T09:24:53Z","title":"Let There Be Light: Robust Lensless Imaging Under External Illumination\n With Deep Learning","summary":" Lensless cameras relax the design constraints of traditional cameras by\nshifting image formation from analog optics to digital post-processing. While\nnew camera designs and applications can be enabled, lensless imaging is very\nsensitive to unwanted interference (other sources, noise, etc.). In this work,\nwe address a prevalent noise source that has not been studied for lensless\nimaging: external illumination e.g. from ambient and direct lighting. Being\nrobust to a variety of lighting conditions would increase the practicality and\nadoption of lensless imaging. To this end, we propose multiple recovery\napproaches that account for external illumination by incorporating its estimate\ninto the image recovery process. At the core is a physics-based reconstruction\nthat combines learnable image recovery and denoisers, all of whose parameters\nare trained using experimentally gathered data. Compared to standard\nreconstruction methods, our approach yields significant qualitative and\nquantitative improvements. 
We open-source our implementations and a 25K dataset\nof measurements under multiple lighting conditions.\n","authors":["Eric Bezzam","Stefan Peters","Martin Vetterli"],"pdf_url":"https://arxiv.org/pdf/2409.16766v2.pdf","comment":"4 pages, dataset: https://doi.org/10.57967/hf/2970, accepted to\n ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.12323v1","updated":"2025-01-21T17:42:06Z","published":"2025-01-21T17:42:06Z","title":"Deep Learning Based Segmentation of Blood Vessels from H&E Stained\n Oesophageal Adenocarcinoma Whole-Slide Images","summary":" Blood vessels (BVs) play a critical role in the Tumor Micro-Environment\n(TME), potentially influencing cancer progression and treatment response.\nHowever, manually quantifying BVs in Hematoxylin and Eosin (H&E) stained images\nis challenging and labor-intensive due to their heterogeneous appearances. We\npropose a novel approach of constructing guiding maps to improve the\nperformance of state-of-the-art segmentation models for BV segmentation, the\nguiding maps encourage the models to learn representative features of BVs. This\nis particularly beneficial for computational pathology, where labeled training\ndata is often limited and large models are prone to overfitting. We have\nquantitative and qualitative results to demonstrate the efficacy of our\napproach in improving segmentation accuracy. In future, we plan to validate\nthis method to segment BVs across various tissue types and investigate the role\nof cellular structures in relation to BVs in the TME.\n","authors":["Jiaqi Lv","Stefan S Antonowicz","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2501.12323v1.pdf","comment":"Accepted by ISBI 2025"},{"id":"http://arxiv.org/abs/2501.12319v1","updated":"2025-01-21T17:38:55Z","published":"2025-01-21T17:38:55Z","title":"Metric for Evaluating Performance of Reference-Free Demorphing Methods","summary":" A facial morph is an image created by combining two (or more) face images\npertaining to two (or more) distinct identities. Reference-free face demorphing\ninverts the process and tries to recover the face images constituting a facial\nmorph without using any other information. However, there is no consensus on\nthe evaluation metrics to be used to evaluate and compare such demorphing\ntechniques. In this paper, we first analyze the shortcomings of the demorphing\nmetrics currently used in the literature. We then propose a new metric called\nbiometrically cross-weighted IQA that overcomes these issues and extensively\nbenchmark current methods on the proposed metric to show its efficacy.\nExperiments on three existing demorphing methods and six datasets on two\ncommonly used face matchers validate the efficacy of our proposed metric.\n","authors":["Nitish Shukla","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2501.12319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12318v1","updated":"2025-01-21T17:38:42Z","published":"2025-01-21T17:38:42Z","title":"BlanketGen2-Fit3D: Synthetic Blanket Augmentation Towards Improving\n Real-World In-Bed Blanket Occluded Human Pose Estimation","summary":" Human Pose Estimation (HPE) from monocular RGB images is crucial for clinical\nin-bed skeleton-based action recognition, however, it poses unique challenges\nfor HPE models due to the frequent presence of blankets occluding the person,\nwhile labeled HPE data in this scenario is scarce. 
To address this we introduce\nBlanketGen2-Fit3D (BG2-Fit3D), an augmentation of Fit3D dataset that contains\n1,217,312 frames with synthetic photo-realistic blankets. To generate it we\nused BlanketGen2, our new and improved version of our BlanketGen pipeline that\nsimulates synthetic blankets using ground-truth Skinned Multi-Person Linear\nmodel (SMPL) meshes and then renders them as transparent images that can be\nlayered on top of the original frames. This dataset was used in combination\nwith the original Fit3D to finetune the ViTPose-B HPE model, to evaluate\nsynthetic blanket augmentation effectiveness. The trained models were further\nevaluated on a real-world blanket occluded in-bed HPE dataset (SLP dataset).\nComparing architectures trained on only Fit3D with the ones trained with our\nsynthetic blanket augmentation the later improved pose estimation performance\non BG2-Fit3D, the synthetic blanket occluded dataset significantly to (0.977\nPercentage of Correct Keypoints (PCK), 0.149 Normalized Mean Error (NME)) with\nan absolute 4.4% PCK increase. Furthermore, the test results on SLP\ndemonstrated the utility of synthetic data augmentation by improving\nperformance by an absolute 2.3% PCK, on real-world images with the poses\noccluded by real blankets. These results show synthetic blanket augmentation\nhas the potential to improve in-bed blanket occluded HPE from RGB images. The\ndataset as well as the code will be made available to the public.\n","authors":["Tamás Karácsony","João Carmona","João Paulo Silva Cunha"],"pdf_url":"https://arxiv.org/pdf/2501.12318v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.14477v2","updated":"2025-01-21T17:15:10Z","published":"2024-05-23T12:06:00Z","title":"LiteVAE: Lightweight and Efficient Variational Autoencoders for Latent\n Diffusion Models","summary":" Advances in latent diffusion models (LDMs) have revolutionized\nhigh-resolution image generation, but the design space of the autoencoder that\nis central to these systems remains underexplored. In this paper, we introduce\nLiteVAE, a new autoencoder design for LDMs, which leverages the 2D discrete\nwavelet transform to enhance scalability and computational efficiency over\nstandard variational autoencoders (VAEs) with no sacrifice in output quality.\nWe investigate the training methodologies and the decoder architecture of\nLiteVAE and propose several enhancements that improve the training dynamics and\nreconstruction quality. Our base LiteVAE model matches the quality of the\nestablished VAEs in current LDMs with a six-fold reduction in encoder\nparameters, leading to faster training and lower GPU memory requirements, while\nour larger model outperforms VAEs of comparable complexity across all evaluated\nmetrics (rFID, LPIPS, PSNR, and SSIM).\n","authors":["Seyedmorteza Sadat","Jakob Buhmann","Derek Bradley","Otmar Hilliges","Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2405.14477v2.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2501.12299v1","updated":"2025-01-21T17:11:25Z","published":"2025-01-21T17:11:25Z","title":"Sublinear Variational Optimization of Gaussian Mixture Models with\n Millions to Billions of Parameters","summary":" Gaussian Mixture Models (GMMs) range among the most frequently used machine\nlearning models. 
However, training large, general GMMs becomes computationally\nprohibitive for datasets with many data points $N$ of high-dimensionality $D$.\nFor GMMs with arbitrary covariances, we here derive a highly efficient\nvariational approximation, which is integrated with mixtures of factor\nanalyzers (MFAs). For GMMs with $C$ components, our proposed algorithm\nsignificantly reduces runtime complexity per iteration from\n$\\mathcal{O}(NCD^2)$ to a complexity scaling linearly with $D$ and remaining\nconstant w.r.t. $C$. Numerical validation of this theoretical complexity\nreduction then shows the following: the distance evaluations required for the\nentire GMM optimization process scale sublinearly with $NC$. On large-scale\nbenchmarks, this sublinearity results in speed-ups of an order-of-magnitude\ncompared to the state-of-the-art. As a proof of concept, we train GMMs with\nover 10 billion parameters on about 100 million images, and observe training\ntimes of approximately nine hours on a single state-of-the-art CPU.\n","authors":["Sebastian Salwig","Till Kahlke","Florian Hirschberger","Dennis Forster","Jörg Lücke"],"pdf_url":"https://arxiv.org/pdf/2501.12299v1.pdf","comment":"22 pages, 6 figures (and 17 pages, 3 figures in Appendix)"},{"id":"http://arxiv.org/abs/2501.12296v1","updated":"2025-01-21T17:03:06Z","published":"2025-01-21T17:03:06Z","title":"RALAD: Bridging the Real-to-Sim Domain Gap in Autonomous Driving with\n Retrieval-Augmented Learning","summary":" In the pursuit of robust autonomous driving systems, models trained on\nreal-world datasets often struggle to adapt to new environments, particularly\nwhen confronted with corner cases such as extreme weather conditions.\nCollecting these corner cases in the real world is non-trivial, which\nnecessitates the use of simulators for validation. However,the high\ncomputational cost and the domain gap in data distribution have hindered the\nseamless transition between real and simulated driving scenarios. To tackle\nthis challenge, we propose Retrieval-Augmented Learning for Autonomous Driving\n(RALAD), a novel framework designed to bridge the real-to-sim gap at a low\ncost. RALAD features three primary designs, including (1) domain adaptation via\nan enhanced Optimal Transport (OT) method that accounts for both individual and\ngrouped image distances, (2) a simple and unified framework that can be applied\nto various models, and (3) efficient fine-tuning techniques that freeze the\ncomputationally expensive layers while maintaining robustness. Experimental\nresults demonstrate that RALAD compensates for the performance degradation in\nsimulated environments while maintaining accuracy in real-world scenarios\nacross three different models. Taking Cross View as an example, the mIOU and\nmAP metrics in real-world scenarios remain stable before and after RALAD\nfine-tuning, while in simulated environments,the mIOU and mAP metrics are\nimproved by 10.30% and 12.29%, respectively. Moreover, the re-training cost of\nour approach is reduced by approximately 88.1%. 
Our code is available at\nhttps://github.com/JiachengZuo/RALAD.git.\n","authors":["Jiacheng Zuo","Haibo Hu","Zikang Zhou","Yufei Cui","Ziquan Liu","Jianping Wang","Nan Guan","Jin Wang","Chun Jason Xue"],"pdf_url":"https://arxiv.org/pdf/2501.12296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12295v1","updated":"2025-01-21T17:02:51Z","published":"2025-01-21T17:02:51Z","title":"Towards Accurate Unified Anomaly Segmentation","summary":" Unsupervised anomaly detection (UAD) from images strives to model normal data\ndistributions, creating discriminative representations to distinguish and\nprecisely localize anomalies. Despite recent advancements in the efficient and\nunified one-for-all scheme, challenges persist in accurately segmenting\nanomalies for further monitoring. Moreover, this problem is obscured by the\nwidely-used AUROC metric under imbalanced UAD settings. This motivates us to\nemphasize the significance of precise segmentation of anomaly pixels using pAP\nand DSC as metrics. To address the unsolved segmentation task, we introduce the\nUnified Anomaly Segmentation (UniAS). UniAS presents a multi-level hybrid\npipeline that progressively enhances normal information from coarse to fine,\nincorporating a novel multi-granularity gated CNN (MGG-CNN) into Transformer\nlayers to explicitly aggregate local details from different granularities.\nUniAS achieves state-of-the-art anomaly segmentation performance, attaining\n65.12/59.33 and 40.06/32.50 in pAP/DSC on the MVTec-AD and VisA datasets,\nrespectively, surpassing previous methods significantly. The codes are shared\nat https://github.com/Mwxinnn/UniAS.\n","authors":["Wenxin Ma","Qingsong Yao","Xiang Zhang","Zhelong Huang","Zihang Jiang","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.12295v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.12289v1","updated":"2025-01-21T16:59:13Z","published":"2025-01-21T16:59:13Z","title":"Regressor-Guided Image Editing Regulates Emotional Response to Reduce\n Online Engagement","summary":" Emotions are known to mediate the relationship between users' content\nconsumption and their online engagement, with heightened emotional intensity\nleading to increased engagement. Building on this insight, we propose three\nregressor-guided image editing approaches aimed at diminishing the emotional\nimpact of images. These include (i) a parameter optimization approach based on\nglobal image transformations known to influence emotions, (ii) an optimization\napproach targeting the style latent space of a generative adversarial network,\nand (iii) a diffusion-based approach employing classifier guidance and\nclassifier-free guidance. Our findings demonstrate that approaches can\neffectively alter the emotional properties of images while maintaining high\nvisual quality. Optimization-based methods primarily adjust low-level\nproperties like color hues and brightness, whereas the diffusion-based approach\nintroduces semantic changes, such as altering appearance or facial expressions.\nNotably, results from a behavioral study reveal that only the diffusion-based\napproach successfully elicits changes in viewers' emotional responses while\npreserving high perceived image quality. 
In future work, we will investigate\nthe impact of these image adaptations on internet user behavior.\n","authors":["Christoph Gebhardt","Robin Willardt","Seyedmorteza Sadat","Chih-Wei Ning","Andreas Brombach","Jie Song","Otmar Hilliges","Christian Holz"],"pdf_url":"https://arxiv.org/pdf/2501.12289v1.pdf","comment":"39 pages, 22 figures"},{"id":"http://arxiv.org/abs/2501.12275v1","updated":"2025-01-21T16:44:51Z","published":"2025-01-21T16:44:51Z","title":"With Great Backbones Comes Great Adversarial Transferability","summary":" Advances in self-supervised learning (SSL) for machine vision have improved\nrepresentation robustness and model performance, giving rise to pre-trained\nbackbones like \\emph{ResNet} and \\emph{ViT} models tuned with SSL methods such\nas \\emph{SimCLR}. Due to the computational and data demands of pre-training,\nthe utilization of such backbones becomes a strenuous necessity. However,\nemploying these backbones may inherit vulnerabilities to adversarial attacks.\nWhile adversarial robustness has been studied under \\emph{white-box} and\n\\emph{black-box} settings, the robustness of models tuned on pre-trained\nbackbones remains largely unexplored. Additionally, the role of tuning\nmeta-information in mitigating exploitation risks is unclear. This work\nsystematically evaluates the adversarial robustness of such models across\n$20,000$ combinations of tuning meta-information, including fine-tuning\ntechniques, backbone families, datasets, and attack types. We propose using\nproxy models to transfer attacks, simulating varying levels of target knowledge\nby fine-tuning these proxies with diverse configurations. Our findings reveal\nthat proxy-based attacks approach the effectiveness of \\emph{white-box}\nmethods, even with minimal tuning knowledge. We also introduce a naive\n\"backbone attack,\" leveraging only the backbone to generate adversarial\nsamples, which outperforms \\emph{black-box} attacks and rivals \\emph{white-box}\nmethods, highlighting critical risks in model-sharing practices. Finally, our\nablations reveal how increasing tuning meta-information impacts attack\ntransferability, measuring each meta-information combination.\n","authors":["Erik Arakelyan","Karen Hambardzumyan","Davit Papikyan","Pasquale Minervini","Albert Gordo","Isabelle Augenstein","Aram H. Markosyan"],"pdf_url":"https://arxiv.org/pdf/2501.12275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12269v1","updated":"2025-01-21T16:40:44Z","published":"2025-01-21T16:40:44Z","title":"Benchmarking Image Perturbations for Testing Automated Driving\n Assistance Systems","summary":" Advanced Driver Assistance Systems (ADAS) based on deep neural networks\n(DNNs) are widely used in autonomous vehicles for critical perception tasks\nsuch as object detection, semantic segmentation, and lane recognition. However,\nthese systems are highly sensitive to input variations, such as noise and\nchanges in lighting, which can compromise their effectiveness and potentially\nlead to safety-critical failures.\n This study offers a comprehensive empirical evaluation of image\nperturbations, techniques commonly used to assess the robustness of DNNs, to\nvalidate and improve the robustness and generalization of ADAS perception\nsystems. We first conducted a systematic review of the literature, identifying\n38 categories of perturbations. Next, we evaluated their effectiveness in\nrevealing failures in two different ADAS, both at the component and at the\nsystem level. 
Finally, we explored the use of perturbation-based data\naugmentation and continuous learning strategies to improve ADAS adaptation to\nnew operational design domains. Our results demonstrate that all categories of\nimage perturbations successfully expose robustness issues in ADAS and that the\nuse of dataset augmentation and continuous learning significantly improves ADAS\nperformance in novel, unseen environments.\n","authors":["Stefano Carlo Lambertenghi","Hannes Leonhard","Andrea Stocco"],"pdf_url":"https://arxiv.org/pdf/2501.12269v1.pdf","comment":"Accepted for publication at the 18th IEEE International Conference on\n Software Testing, Verification and Validation (ICST 2025)"},{"id":"http://arxiv.org/abs/2408.10202v2","updated":"2025-01-21T16:39:56Z","published":"2024-08-19T17:57:28Z","title":"SANER: Annotation-free Societal Attribute Neutralizer for Debiasing CLIP","summary":" Large-scale vision-language models, such as CLIP, are known to contain\nsocietal bias regarding protected attributes (e.g., gender, age). This paper\naims to address the problems of societal bias in CLIP. Although previous\nstudies have proposed to debias societal bias through adversarial learning or\ntest-time projecting, our comprehensive study of these works identifies two\ncritical limitations: 1) loss of attribute information when it is explicitly\ndisclosed in the input and 2) use of the attribute annotations during debiasing\nprocess. To mitigate societal bias in CLIP and overcome these limitations\nsimultaneously, we introduce a simple-yet-effective debiasing method called\nSANER (societal attribute neutralizer) that eliminates attribute information\nfrom CLIP text features only of attribute-neutral descriptions. Experimental\nresults show that SANER, which does not require attribute annotations and\npreserves original information for attribute-specific descriptions,\ndemonstrates superior debiasing ability than the existing methods.\nAdditionally, we observe that SANER does not require retraining CLIP from\nscratch with the original dataset. Moreover, the debiased model can be directly\napplied to the text-to-image generation model by simply replacing the text\nencoder.\n","authors":["Yusuke Hirota","Min-Hung Chen","Chien-Yi Wang","Yuta Nakashima","Yu-Chiang Frank Wang","Ryo Hachiuma"],"pdf_url":"https://arxiv.org/pdf/2408.10202v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12267v1","updated":"2025-01-21T16:39:09Z","published":"2025-01-21T16:39:09Z","title":"VipDiff: Towards Coherent and Diverse Video Inpainting via Training-free\n Denoising Diffusion Models","summary":" Recent video inpainting methods have achieved encouraging improvements by\nleveraging optical flow to guide pixel propagation from reference frames either\nin the image space or feature space. However, they would produce severe\nartifacts in the mask center when the masked area is too large and no pixel\ncorrespondences can be found for the center. Recently, diffusion models have\ndemonstrated impressive performance in generating diverse and high-quality\nimages, and have been exploited in a number of works for image inpainting.\nThese methods, however, cannot be applied directly to videos to produce\ntemporal-coherent inpainting results. 
In this paper, we propose a training-free\nframework, named VipDiff, for conditioning diffusion model on the reverse\ndiffusion process to produce temporal-coherent inpainting results without\nrequiring any training data or fine-tuning the pre-trained diffusion models.\nVipDiff takes optical flow as guidance to extract valid pixels from reference\nframes to serve as constraints in optimizing the randomly sampled Gaussian\nnoise, and uses the generated results for further pixel propagation and\nconditional generation. VipDiff also allows for generating diverse video\ninpainting results over different sampled noise. Experiments demonstrate that\nVipDiff can largely outperform state-of-the-art video inpainting methods in\nterms of both spatial-temporal coherence and fidelity.\n","authors":["Chaohao Xie","Kai Han","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2501.12267v1.pdf","comment":"10 pages, 5 Figures (Accepted at WACV 2025)"},{"id":"http://arxiv.org/abs/2501.12266v1","updated":"2025-01-21T16:38:04Z","published":"2025-01-21T16:38:04Z","title":"CBVLM: Training-free Explainable Concept-based Large Vision Language\n Models for Medical Image Classification","summary":" The main challenges limiting the adoption of deep learning-based solutions in\nmedical workflows are the availability of annotated data and the lack of\ninterpretability of such systems. Concept Bottleneck Models (CBMs) tackle the\nlatter by constraining the final disease prediction on a set of predefined and\nhuman-interpretable concepts. However, the increased interpretability achieved\nthrough these concept-based explanations implies a higher annotation burden.\nMoreover, if a new concept needs to be added, the whole system needs to be\nretrained. Inspired by the remarkable performance shown by Large\nVision-Language Models (LVLMs) in few-shot settings, we propose a simple, yet\neffective, methodology, CBVLM, which tackles both of the aforementioned\nchallenges. First, for each concept, we prompt the LVLM to answer if the\nconcept is present in the input image. Then, we ask the LVLM to classify the\nimage based on the previous concept predictions. Moreover, in both stages, we\nincorporate a retrieval module responsible for selecting the best examples for\nin-context learning. By grounding the final diagnosis on the predicted\nconcepts, we ensure explainability, and by leveraging the few-shot capabilities\nof LVLMs, we drastically lower the annotation cost. We validate our approach\nwith extensive experiments across four medical datasets and twelve LVLMs (both\ngeneric and medical) and show that CBVLM consistently outperforms CBMs and\ntask-specific supervised methods without requiring any training and using just\na few annotated examples. More information on our project page:\nhttps://cristianopatricio.github.io/CBVLM/.\n","authors":["Cristiano Patrício","Isabel Rio-Torto","Jaime S. Cardoso","Luís F. Teixeira","João C. Neves"],"pdf_url":"https://arxiv.org/pdf/2501.12266v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2501.12263v1","updated":"2025-01-21T16:34:16Z","published":"2025-01-21T16:34:16Z","title":"mmCooper: A Multi-agent Multi-stage Communication-efficient and\n Collaboration-robust Cooperative Perception Framework","summary":" Collaborative perception significantly enhances individual vehicle perception\nperformance through the exchange of sensory information among agents. 
However,\nreal-world deployment faces challenges due to bandwidth constraints and\ninevitable calibration errors during information exchange. To address these\nissues, we propose mmCooper, a novel multi-agent, multi-stage,\ncommunication-efficient, and collaboration-robust cooperative perception\nframework. Our framework leverages a multi-stage collaboration strategy that\ndynamically and adaptively balances intermediate- and late-stage information to\nshare among agents, enhancing perceptual performance while maintaining\ncommunication efficiency. To support robust collaboration despite potential\nmisalignments and calibration errors, our framework captures multi-scale\ncontextual information for robust fusion in the intermediate stage and\ncalibrates the received detection results to improve accuracy in the late\nstage. We validate the effectiveness of mmCooper through extensive experiments\non real-world and simulated datasets. The results demonstrate the superiority\nof our proposed framework and the effectiveness of each component.\n","authors":["Bingyi Liu","Jian Teng","Hongfei Xue","Enshu Wang","Chuanhui Zhu","Pu Wang","Libing Wu"],"pdf_url":"https://arxiv.org/pdf/2501.12263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12255v1","updated":"2025-01-21T16:23:05Z","published":"2025-01-21T16:23:05Z","title":"HAC++: Towards 100X Compression of 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel\nview synthesis, boasting rapid rendering speed with high fidelity. However, the\nsubstantial Gaussians and their associated attributes necessitate effective\ncompression techniques. Nevertheless, the sparse and unorganized nature of the\npoint cloud of Gaussians (or anchors in our paper) presents challenges for\ncompression. To achieve a compact size, we propose HAC++, which leverages the\nrelationships between unorganized anchors and a structured hash grid, utilizing\ntheir mutual information for context modeling. Additionally, HAC++ captures\nintra-anchor contextual relationships to further enhance compression\nperformance. To facilitate entropy coding, we utilize Gaussian distributions to\nprecisely estimate the probability of each quantized attribute, where an\nadaptive quantization module is proposed to enable high-precision quantization\nof these attributes for improved fidelity restoration. Moreover, we incorporate\nan adaptive masking strategy to eliminate invalid Gaussians and anchors.\nOverall, HAC++ achieves a remarkable size reduction of over 100X compared to\nvanilla 3DGS when averaged on all datasets, while simultaneously improving\nfidelity. It also delivers more than 20X size reduction compared to\nScaffold-GS. Our code is available at\nhttps://github.com/YihangChen-ee/HAC-plus.\n","authors":["Yihang Chen","Qianyi Wu","Weiyao Lin","Mehrtash Harandi","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2501.12255v1.pdf","comment":"IEEE TPAMI Submission. This paper is an extension of HAC at\n arXiv:2403.14530 (ECCV 2024)"},{"id":"http://arxiv.org/abs/2501.12254v1","updated":"2025-01-21T16:19:38Z","published":"2025-01-21T16:19:38Z","title":"Memory Storyboard: Leveraging Temporal Segmentation for Streaming\n Self-Supervised Learning from Egocentric Videos","summary":" Self-supervised learning holds the promise to learn good representations from\nreal-world continuous uncurated data streams. However, most existing works in\nvisual self-supervised learning focus on static images or artificial data\nstreams. 
Towards exploring a more realistic learning substrate, we investigate\nstreaming self-supervised learning from long-form real-world egocentric video\nstreams. Inspired by the event segmentation mechanism in human perception and\nmemory, we propose \"Memory Storyboard\" that groups recent past frames into\ntemporal segments for more effective summarization of the past visual streams\nfor memory replay. To accommodate efficient temporal segmentation, we propose a\ntwo-tier memory hierarchy: the recent past is stored in a short-term memory,\nand the storyboard temporal segments are then transferred to a long-term\nmemory. Experiments on real-world egocentric video datasets including SAYCam\nand KrishnaCam show that contrastive learning objectives on top of storyboard\nframes result in semantically meaningful representations which outperform those\nproduced by state-of-the-art unsupervised continual learning methods.\n","authors":["Yanlai Yang","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2501.12254v1.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.05884v2","updated":"2025-01-21T16:11:44Z","published":"2024-11-08T10:05:14Z","title":"Untrained Perceptual Loss for image denoising of line-like structures in\n MR images","summary":" In the acquisition of Magnetic Resonance (MR) images shorter scan times lead\nto higher image noise. Therefore, automatic image denoising using deep learning\nmethods is of high interest. MR images containing line-like structures such as\nroots or vessels yield special characteristics as they display connected\nstructures and yield sparse information. For this kind of data, it is important\nto consider voxel neighborhoods when training a denoising network. In this\npaper, we translate the Perceptual Loss to 3D data by comparing feature maps of\nuntrained networks in the loss function as done previously for 2D data. We\ntested the performance of untrained Perceptual Loss (uPL) on 3D image denoising\nof MR images displaying brain vessels (MR angiograms - MRA) and images of plant\nroots in soil. We investigate the impact of various uPL characteristics such as\nweight initialization, network depth, kernel size, and pooling operations on\nthe results. We tested the performance of the uPL loss on four Rician noise\nlevels using evaluation metrics such as the Structural Similarity Index Metric\n(SSIM). We observe, that our uPL outperforms conventional loss functions such\nas the L1 loss or a loss based on the Structural Similarity Index Metric\n(SSIM). The uPL network's initialization is not important, while network depth\nand pooling operations impact denoising performance. E.g. for both datasets a\nnetwork with five convolutional layers led to the best performance while a\nnetwork with more layers led to a performance drop. We also find that small uPL\nnetworks led to better or comparable results than using large networks such as\nVGG. We observe superior performance of our loss for both datasets, all noise\nlevels, and three network architectures. 
In conclusion, for images containing\nline-like structures, uPL is an alternative to other loss functions for 3D\nimage denoising.\n","authors":["Elisabeth Pfaehler","Daniel Pflugfelder","Hanno Scharr"],"pdf_url":"https://arxiv.org/pdf/2411.05884v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12246v1","updated":"2025-01-21T16:07:32Z","published":"2025-01-21T16:07:32Z","title":"Video Deblurring by Sharpness Prior Detection and Edge Information","summary":" Video deblurring is essential task for autonomous driving, facial\nrecognition, and security surveillance. Traditional methods directly estimate\nmotion blur kernels, often introducing artifacts and leading to poor results.\nRecent approaches utilize the detection of sharp frames within video sequences\nto enhance deblurring. However, existing datasets rely on fixed number of sharp\nframes, which may be too restrictive for some applications and may introduce a\nbias during model training. To address these limitations and enhance domain\nadaptability, this work first introduces GoPro Random Sharp (GoProRS), a new\ndataset where the the frequency of sharp frames within the sequence is\ncustomizable, allowing more diverse training and testing scenarios.\nFurthermore, it presents a novel video deblurring model, called SPEINet, that\nintegrates sharp frame features into blurry frame reconstruction through an\nattention-based encoder-decoder architecture, a lightweight yet robust sharp\nframe detection and an edge extraction phase. Extensive experimental results\ndemonstrate that SPEINet outperforms state-of-the-art methods across multiple\ndatasets, achieving an average of +3.2% PSNR improvement over recent\ntechniques. Given such promising results, we believe that both the proposed\nmodel and dataset pave the way for future advancements in video deblurring\nbased on the detection of sharp frames.\n","authors":["Yang Tian","Fabio Brau","Giulio Rossolini","Giorgio Buttazzo","Hao Meng"],"pdf_url":"https://arxiv.org/pdf/2501.12246v1.pdf","comment":"Under review in Pattern Recognition"},{"id":"http://arxiv.org/abs/2501.12245v1","updated":"2025-01-21T16:04:53Z","published":"2025-01-21T16:04:53Z","title":"Quality Enhancement of Radiographic X-ray Images by Interpretable\n Mapping","summary":" X-ray imaging is the most widely used medical imaging modality. However, in\nthe common practice, inconsistency in the initial presentation of X-ray images\nis a common complaint by radiologists. Different patient positions, patient\nhabitus and scanning protocols can lead to differences in image presentations,\ne.g., differences in brightness and contrast globally or regionally. To\ncompensate for this, additional work will be executed by clinical experts to\nadjust the images to the desired presentation, which can be time-consuming.\nExisting deep-learning-based end-to-end solutions can automatically correct\nimages with promising performances. Nevertheless, these methods are hard to be\ninterpreted and difficult to be understood by clinical experts. In this\nmanuscript, a novel interpretable mapping method by deep learning is proposed,\nwhich automatically enhances the image brightness and contrast globally and\nlocally. Meanwhile, because the model is inspired by the workflow of the\nbrightness and contrast manipulation, it can provide interpretable pixel maps\nfor explaining the motivation of image enhancement. 
The experiment on the\nclinical datasets show the proposed method can provide consistent brightness\nand contrast correction on X-ray images with accuracy of 24.75 dB PSNR and\n0.8431 SSIM.\n","authors":["Hongxu Yang","Najib Akram Aboobacker","Xiaomeng Dong","German Gonzalez","Lehel Ferenczi","Gopal Avinash"],"pdf_url":"https://arxiv.org/pdf/2501.12245v1.pdf","comment":"SPIE Medical Imaging 2025"},{"id":"http://arxiv.org/abs/2501.12244v1","updated":"2025-01-21T16:04:39Z","published":"2025-01-21T16:04:39Z","title":"Zero-shot Bias Correction: Efficient MR Image Inhomogeneity Reduction\n Without Any Data","summary":" In recent years, deep neural networks for image inhomogeneity reduction have\nshown promising results. However, current methods with (un)supervised solutions\nrequire preparing a training dataset, which is expensive and laborious for data\ncollection. In this work, we demonstrate a novel zero-shot deep neural\nnetworks, which requires no data for pre-training and dedicated assumption of\nthe bias field. The designed light-weight CNN enables an efficient zero-shot\nadaptation for bias-corrupted image correction. Our method provides a novel\nsolution to mitigate the biased corrupted image as iterative homogeneity\nrefinement, which therefore ensures the considered issue can be solved easier\nwith stable convergence of zero-shot optimization. Extensive comparison on\ndifferent datasets show that the proposed method performs better than current\ndata-free N4 methods in both efficiency and accuracy.\n","authors":["Hongxu Yang","Edina Timko","Brice Fernandez"],"pdf_url":"https://arxiv.org/pdf/2501.12244v1.pdf","comment":"Accepted by ISBI 2025. Supported by IHI PREDICTOM Project"},{"id":"http://arxiv.org/abs/2501.12239v1","updated":"2025-01-21T15:59:21Z","published":"2025-01-21T15:59:21Z","title":"Investigating Market Strength Prediction with CNNs on Candlestick Chart\n Images","summary":" This paper investigates predicting market strength solely from candlestick\nchart images to assist investment decisions. The core research problem is\ndeveloping an effective computer vision-based model using raw candlestick\nvisuals without time-series data. We specifically analyze the impact of\nincorporating candlestick patterns that were detected by YOLOv8. The study\nimplements two approaches: pure CNN on chart images and a Decomposer\narchitecture detecting patterns. Experiments utilize diverse financial datasets\nspanning stocks, cryptocurrencies, and forex assets. Key findings demonstrate\ncandlestick patterns do not improve model performance over only image data in\nour research. The significance is illuminating limitations in candlestick image\nsignals. Performance peaked at approximately 0.7 accuracy, below more complex\ntime-series models. Outcomes reveal challenges in distilling sufficient\npredictive power from visual shapes alone, motivating the incorporation of\nother data modalities. This research clarifies how purely image-based models\ncan inform trading while confirming patterns add little value over raw charts.\nOur content is endeavored to be delineated into distinct sections, each\nautonomously furnishing a unique contribution while maintaining cohesive\nlinkage. 
Note that, the examples discussed herein are not limited to the scope,\napplicability, or knowledge outlined in the paper.\n","authors":["Thanh Nam Duong","Trung Kien Hoang","Quoc Khanh Duong","Quoc Dat Dinh","Duc Hoan Le","Huy Tuan Nguyen","Xuan Bach Nguyen","Quy Ban Tran"],"pdf_url":"https://arxiv.org/pdf/2501.12239v1.pdf","comment":"ACMLC 2025; 8 pages"},{"id":"http://arxiv.org/abs/2501.12235v1","updated":"2025-01-21T15:58:16Z","published":"2025-01-21T15:58:16Z","title":"DLEN: Dual Branch of Transformer for Low-Light Image Enhancement in Dual\n Domains","summary":" Low-light image enhancement (LLE) aims to improve the visual quality of\nimages captured in poorly lit conditions, which often suffer from low\nbrightness, low contrast, noise, and color distortions. These issues hinder the\nperformance of computer vision tasks such as object detection, facial\nrecognition, and autonomous driving.Traditional enhancement techniques, such as\nmulti-scale fusion and histogram equalization, fail to preserve fine details\nand often struggle with maintaining the natural appearance of enhanced images\nunder complex lighting conditions. Although the Retinex theory provides a\nfoundation for image decomposition, it often amplifies noise, leading to\nsuboptimal image quality. In this paper, we propose the Dual Light Enhance\nNetwork (DLEN), a novel architecture that incorporates two distinct attention\nmechanisms, considering both spatial and frequency domains. Our model\nintroduces a learnable wavelet transform module in the illumination estimation\nphase, preserving high- and low-frequency components to enhance edge and\ntexture details. Additionally, we design a dual-branch structure that leverages\nthe power of the Transformer architecture to enhance both the illumination and\nstructural components of the image.Through extensive experiments, our model\noutperforms state-of-the-art methods on standard benchmarks.Code is available\nhere: https://github.com/LaLaLoXX/DLEN\n","authors":["Junyu Xia","Jiesong Bai","Yihang Dong"],"pdf_url":"https://arxiv.org/pdf/2501.12235v1.pdf","comment":"10pages,6figures"},{"id":"http://arxiv.org/abs/2501.12231v1","updated":"2025-01-21T15:55:06Z","published":"2025-01-21T15:55:06Z","title":"InsTALL: Context-aware Instructional Task Assistance with Multi-modal\n Large Language Models","summary":" The improved competence of generative models can help building multi-modal\nvirtual assistants that leverage modalities beyond language. By observing\nhumans performing multi-step tasks, one can build assistants that have\nsituational awareness of actions and tasks being performed, enabling them to\ncater assistance based on this understanding. In this paper, we develop a\nContext-aware Instructional Task Assistant with Multi-modal Large Language\nModels (InsTALL) that leverages an online visual stream (e.g. a user's screen\nshare or video recording) and responds in real-time to user queries related to\nthe task at hand. To enable useful assistance, InsTALL 1) trains a multi-modal\nmodel on task videos and paired textual data, and 2) automatically extracts\ntask graph from video data and leverages it at training and inference time. 
We\nshow InsTALL achieves state-of-the-art performance across proposed sub-tasks\nconsidered for multimodal activity understanding -- task recognition (TR),\naction recognition (AR), next action prediction (AP), and plan prediction (PP)\n-- and outperforms existing baselines on two novel sub-tasks related to\nautomatic error identification.\n","authors":["Pha Nguyen","Sailik Sengupta","Girik Malik","Arshit Gupta","Bonan Min"],"pdf_url":"https://arxiv.org/pdf/2501.12231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12224v1","updated":"2025-01-21T15:49:29Z","published":"2025-01-21T15:49:29Z","title":"TokenVerse: Versatile Multi-concept Personalization in Token Modulation\n Space","summary":" We present TokenVerse -- a method for multi-concept personalization,\nleveraging a pre-trained text-to-image diffusion model. Our framework can\ndisentangle complex visual elements and attributes from as little as a single\nimage, while enabling seamless plug-and-play generation of combinations of\nconcepts extracted from multiple images. As opposed to existing works,\nTokenVerse can handle multiple images with multiple concepts each, and supports\na wide-range of concepts, including objects, accessories, materials, pose, and\nlighting. Our work exploits a DiT-based text-to-image model, in which the input\ntext affects the generation through both attention and modulation (shift and\nscale). We observe that the modulation space is semantic and enables localized\ncontrol over complex concepts. Building on this insight, we devise an\noptimization-based framework that takes as input an image and a text\ndescription, and finds for each word a distinct direction in the modulation\nspace. These directions can then be used to generate new images that combine\nthe learned concepts in a desired configuration. We demonstrate the\neffectiveness of TokenVerse in challenging personalization settings, and\nshowcase its advantages over existing methods. project's webpage in\nhttps://token-verse.github.io/\n","authors":["Daniel Garibi","Shahar Yadin","Roni Paiss","Omer Tov","Shiran Zada","Ariel Ephrat","Tomer Michaeli","Inbar Mosseri","Tali Dekel"],"pdf_url":"https://arxiv.org/pdf/2501.12224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12218v1","updated":"2025-01-21T15:39:40Z","published":"2025-01-21T15:39:40Z","title":"Exploring Temporally-Aware Features for Point Tracking","summary":" Point tracking in videos is a fundamental task with applications in robotics,\nvideo editing, and more. While many vision tasks benefit from pre-trained\nfeature backbones to improve generalizability, point tracking has primarily\nrelied on simpler backbones trained from scratch on synthetic data, which may\nlimit robustness in real-world scenarios. Additionally, point tracking requires\ntemporal awareness to ensure coherence across frames, but using\ntemporally-aware features is still underexplored. Most current methods often\nemploy a two-stage process: an initial coarse prediction followed by a\nrefinement stage to inject temporal information and correct errors from the\ncoarse stage. These approach, however, is computationally expensive and\npotentially redundant if the feature backbone itself captures sufficient\ntemporal information.\n In this work, we introduce Chrono, a feature backbone specifically designed\nfor point tracking with built-in temporal awareness. 
Leveraging pre-trained\nrepresentations from self-supervised learner DINOv2 and enhanced with a\ntemporal adapter, Chrono effectively captures long-term temporal context,\nenabling precise prediction even without the refinement stage. Experimental\nresults demonstrate that Chrono achieves state-of-the-art performance in a\nrefiner-free setting on the TAP-Vid-DAVIS and TAP-Vid-Kinetics datasets, among\ncommon feature backbones used in point tracking as well as DINOv2, with\nexceptional efficiency. Project page: https://cvlab-kaist.github.io/Chrono/\n","authors":["Inès Hyeonsu Kim","Seokju Cho","Jiahui Huang","Jung Yi","Joon-Young Lee","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2501.12218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12217v1","updated":"2025-01-21T15:39:29Z","published":"2025-01-21T15:39:29Z","title":"Early Detection and Classification of Breast Cancer Using Deep Learning\n Techniques","summary":" Breast cancer is one of the deadliest cancers causing about massive number of\npatients to die annually all over the world according to the WHO. It is a kind\nof cancer that develops when the tissues of the breast grow rapidly and\nunboundly. This fatality rate can be prevented if the cancer is detected before\nit gets malignant. Using automation for early-age detection of breast cancer,\nArtificial Intelligence and Machine Learning technologies can be implemented\nfor the best outcome. In this study, we are using the Breast Cancer Image\nClassification dataset collected from the Kaggle depository, which comprises\n9248 Breast Ultrasound Images and is classified into three categories: Benign,\nMalignant, and Normal which refers to non-cancerous, cancerous, and normal\nimages.This research introduces three pretrained model featuring custom\nclassifiers that includes ResNet50, MobileNet, and VGG16, along with a custom\nCNN model utilizing the ReLU activation function.The models ResNet50,\nMobileNet, VGG16, and a custom CNN recorded accuracies of 98.41%, 97.91%,\n98.19%, and 92.94% on the dataset, correspondingly, with ResNet50 achieving the\nhighest accuracy of 98.41%.This model, with its deep and powerful architecture,\nis particularly successful in detecting aberrant cells as well as cancerous or\nnon-cancerous tumors. These accuracies show that the Machine Learning methods\nare more compatible for the classification and early detection of breast\ncancer.\n","authors":["Mst. Mumtahina Labonno","D. M. Asadujjaman","Md. Mahfujur Rahman","Abdullah Tamim","Mst. Jannatul Ferdous","Rafi Muttaki Mahi"],"pdf_url":"https://arxiv.org/pdf/2501.12217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01957v3","updated":"2025-01-21T15:36:41Z","published":"2025-01-03T18:59:52Z","title":"VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction","summary":" Recent Multimodal Large Language Models (MLLMs) have typically focused on\nintegrating visual and textual modalities, with less emphasis placed on the\nrole of speech in enhancing interaction. However, speech plays a crucial role\nin multimodal dialogue systems, and implementing high-performance in both\nvision and speech tasks remains a significant challenge due to the fundamental\nmodality differences. In this paper, we propose a carefully designed\nmulti-stage training methodology that progressively trains LLM to understand\nboth visual and speech information, ultimately enabling fluent vision and\nspeech interaction. 
Our approach not only preserves strong vision-language\ncapacity, but also enables efficient speech-to-speech dialogue capabilities\nwithout separate ASR and TTS modules, significantly accelerating multimodal\nend-to-end response speed. By comparing our method against state-of-the-art\ncounterparts across benchmarks for image, video, and speech tasks, we\ndemonstrate that our model is equipped with both strong visual and speech\ncapabilities, making near real-time vision and speech interaction.\n","authors":["Chaoyou Fu","Haojia Lin","Xiong Wang","Yi-Fan Zhang","Yunhang Shen","Xiaoyu Liu","Haoyu Cao","Zuwei Long","Heting Gao","Ke Li","Long Ma","Xiawu Zheng","Rongrong Ji","Xing Sun","Caifeng Shan","Ran He"],"pdf_url":"https://arxiv.org/pdf/2501.01957v3.pdf","comment":"https://github.com/VITA-MLLM/VITA (2K+ Stars by now)"},{"id":"http://arxiv.org/abs/2501.12216v1","updated":"2025-01-21T15:36:08Z","published":"2025-01-21T15:36:08Z","title":"RL-RC-DoT: A Block-level RL agent for Task-Aware Video Compression","summary":" Video encoders optimize compression for human perception by minimizing\nreconstruction error under bit-rate constraints. In many modern applications\nsuch as autonomous driving, an overwhelming majority of videos serve as input\nfor AI systems performing tasks like object recognition or segmentation, rather\nthan being watched by humans. It is therefore useful to optimize the encoder\nfor a downstream task instead of for perceptual image quality. However, a major\nchallenge is how to combine such downstream optimization with existing standard\nvideo encoders, which are highly efficient and popular. Here, we address this\nchallenge by controlling the Quantization Parameters (QPs) at the macro-block\nlevel to optimize the downstream task. This granular control allows us to\nprioritize encoding for task-relevant regions within each frame. We formulate\nthis optimization problem as a Reinforcement Learning (RL) task, where the\nagent learns to balance long-term implications of choosing QPs on both task\nperformance and bit-rate constraints. Notably, our policy does not require the\ndownstream task as an input during inference, making it suitable for streaming\napplications and edge devices such as vehicles. We demonstrate significant\nimprovements in two tasks, car detection, and ROI (saliency) encoding. Our\napproach improves task performance for a given bit rate compared to traditional\ntask agnostic encoding methods, paving the way for more efficient task-aware\nvideo compression.\n","authors":["Uri Gadot","Assaf Shocher","Shie Mannor","Gal Chechik","Assaf Hallak"],"pdf_url":"https://arxiv.org/pdf/2501.12216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16640v2","updated":"2025-01-21T15:25:51Z","published":"2024-03-25T11:28:52Z","title":"Multi-Scale Texture Loss for CT denoising with GANs","summary":" Generative Adversarial Networks (GANs) have proved as a powerful framework\nfor denoising applications in medical imaging. However, GAN-based denoising\nalgorithms still suffer from limitations in capturing complex relationships\nwithin the images. In this regard, the loss function plays a crucial role in\nguiding the image generation process, encompassing how much a synthetic image\ndiffers from a real image. To grasp highly complex and non-linear textural\nrelationships in the training process, this work presents a novel approach to\ncapture and embed multi-scale texture information into the loss function. 
Our\nmethod introduces a differentiable multi-scale texture representation of the\nimages dynamically aggregated by a self-attention layer, thus exploiting\nend-to-end gradient-based optimization. We validate our approach by carrying\nout extensive experiments in the context of low-dose CT denoising, a\nchallenging application that aims to enhance the quality of noisy CT scans. We\nutilize three publicly available datasets, including one simulated and two real\ndatasets. The results are promising as compared to other well-established loss\nfunctions, being also consistent across three different GAN architectures. The\ncode is available at:\nhttps://github.com/TrainLaboratory/MultiScaleTextureLoss-MSTLF\n","authors":["Francesco Di Feola","Lorenzo Tronchin","Valerio Guarrasi","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2403.16640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12206v1","updated":"2025-01-21T15:22:31Z","published":"2025-01-21T15:22:31Z","title":"Fixing Imbalanced Attention to Mitigate In-Context Hallucination of\n Large Vision-Language Model","summary":" Large Vision Language Models (LVLMs) have demonstrated remarkable\ncapabilities in understanding and describing visual content, achieving\nstate-of-the-art performance across various vision-language tasks. However,\nthese models frequently exhibit hallucination behavior, where they generate\ndescriptions containing objects or details absent in the input image. Our work\ninvestigates this phenomenon by analyzing attention patterns across transformer\nlayers and heads, revealing that hallucinations often stem from progressive\ndegradation of visual grounding in deeper layers. We propose a novel attention\nmodification approach that combines selective token emphasis and head-specific\nmodulation to maintain visual grounding throughout the generation process. Our\nmethod introduces two key components: (1) a dual-stream token selection\nmechanism that identifies and prioritizes both locally informative and\nspatially significant visual tokens, and (2) an attention head-specific\nmodulation strategy that differentially amplifies visual information processing\nbased on measured visual sensitivity of individual attention heads. Through\nextensive experimentation on the MSCOCO dataset, we demonstrate that our\napproach reduces hallucination rates by up to 62.3\\% compared to baseline\nmodels while maintaining comparable task performance. Our analysis reveals that\nselectively modulating tokens across attention heads with varying levels of\nvisual sensitivity can significantly improve visual grounding without requiring\nmodel retraining.\n","authors":["Kazi Hasan Ibn Arif","Sajib Acharjee Dip","Khizar Hussain","Lang Zhang","Chris Thomas"],"pdf_url":"https://arxiv.org/pdf/2501.12206v1.pdf","comment":"10 pages, 5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2501.12203v1","updated":"2025-01-21T15:18:55Z","published":"2025-01-21T15:18:55Z","title":"Explainability for Vision Foundation Models: A Survey","summary":" As artificial intelligence systems become increasingly integrated into daily\nlife, the field of explainability has gained significant attention. This trend\nis particularly driven by the complexity of modern AI models and their\ndecision-making processes. The advent of foundation models, characterized by\ntheir extensive generalization capabilities and emergent uses, has further\ncomplicated this landscape. 
Foundation models occupy an ambiguous position in\nthe explainability domain: their complexity makes them inherently challenging\nto interpret, yet they are increasingly leveraged as tools to construct\nexplainable models. In this survey, we explore the intersection of foundation\nmodels and eXplainable AI (XAI) in the vision domain. We begin by compiling a\ncomprehensive corpus of papers that bridge these fields. Next, we categorize\nthese works based on their architectural characteristics. We then discuss the\nchallenges faced by current research in integrating XAI within foundation\nmodels. Furthermore, we review common evaluation methodologies for these\ncombined approaches. Finally, we present key observations and insights from our\nsurvey, offering directions for future research in this rapidly evolving field.\n","authors":["Rémi Kazmierczak","Eloïse Berthier","Goran Frehse","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2501.12203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.18675v3","updated":"2025-01-21T15:16:59Z","published":"2024-12-24T20:28:07Z","title":"TAB: Transformer Attention Bottlenecks enable User Intervention and\n Debugging in Vision-Language Models","summary":" Multi-head self-attention (MHSA) is a key component of Transformers, a widely\npopular architecture in both language and vision. Multiple heads intuitively\nenable different parallel processes over the same input. Yet, they also obscure\nthe attribution of each input patch to the output of a model. We propose a\nnovel 1-head Transformer Attention Bottleneck (TAB) layer, inserted after the\ntraditional MHSA architecture, to serve as an attention bottleneck for\ninterpretability and intervention. Unlike standard self-attention, TAB\nconstrains the total attention over all patches to $\\in [0, 1]$. That is, when\nthe total attention is 0, no visual information is propagated further into the\nnetwork and the vision-language model (VLM) would default to a generic,\nimage-independent response. To demonstrate the advantages of TAB, we train VLMs\nwith TAB to perform image difference captioning. Over three datasets, our\nmodels perform similarly to baseline VLMs in captioning but the bottleneck is\nsuperior in localizing changes and in identifying when no changes occur. TAB is\nthe first architecture to enable users to intervene by editing attention, which\noften produces expected outputs by VLMs.\n","authors":["Pooyan Rahmanzadehgervi","Hung Huy Nguyen","Rosanne Liu","Long Mai","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2412.18675v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12202v1","updated":"2025-01-21T15:16:54Z","published":"2025-01-21T15:16:54Z","title":"Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D\n Assets Generation","summary":" We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for\ngenerating high-resolution textured 3D assets. This system includes two\nfoundation components: a large-scale shape generation model -- Hunyuan3D-DiT,\nand a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape\ngenerative model, built on a scalable flow-based diffusion transformer, aims to\ncreate geometry that properly aligns with a given condition image, laying a\nsolid foundation for downstream applications. 
The texture synthesis model,\nbenefiting from strong geometric and diffusion priors, produces high-resolution\nand vibrant texture maps for either generated or hand-crafted meshes.\nFurthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production\nplatform that simplifies the re-creation process of 3D assets. It allows both\nprofessional and amateur users to manipulate or even animate their meshes\nefficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0\noutperforms previous state-of-the-art models, including the open-source models\nand closed-source models in geometry details, condition alignment, texture\nquality, and etc. Hunyuan3D 2.0 is publicly released in order to fill the gaps\nin the open-source 3D community for large-scale foundation generative models.\nThe code and pre-trained weights of our models are available at:\nhttps://github.com/Tencent/Hunyuan3D-2\n","authors":["Zibo Zhao","Zeqiang Lai","Qingxiang Lin","Yunfei Zhao","Haolin Liu","Shuhui Yang","Yifei Feng","Mingxin Yang","Sheng Zhang","Xianghui Yang","Huiwen Shi","Sicong Liu","Junta Wu","Yihang Lian","Fan Yang","Ruining Tang","Zebin He","Xinzhou Wang","Jian Liu","Xuhui Zuo","Zhuo Chen","Biwen Lei","Haohan Weng","Jing Xu","Yiling Zhu","Xinhai Liu","Lixin Xu","Changrong Hu","Tianyu Huang","Lifu Wang","Jihong Zhang","Meng Chen","Liang Dong","Yiwen Jia","Yulin Cai","Jiaao Yu","Yixuan Tang","Hao Zhang","Zheng Ye","Peng He","Runzhou Wu","Chao Zhang","Yonghao Tan","Jie Xiao","Yangyu Tao","Jianchen Zhu","Jinbao Xue","Kai Liu","Chongqing Zhao","Xinming Wu","Zhichao Hu","Lei Qin","Jianbing Peng","Zhan Li","Minghui Chen","Xipeng Zhang","Lin Niu","Paige Wang","Yingkai Wang","Haozhao Kuang","Zhongyi Fan","Xu Zheng","Weihao Zhuang","YingPing He","Tian Liu","Yong Yang","Di Wang","Yuhong Liu","Jie Jiang","Jingwei Huang","Chunchao Guo"],"pdf_url":"https://arxiv.org/pdf/2501.12202v1.pdf","comment":"GitHub link: https://github.com/Tencent/Hunyuan3D-2"},{"id":"http://arxiv.org/abs/2501.12191v1","updated":"2025-01-21T14:56:47Z","published":"2025-01-21T14:56:47Z","title":"A margin-based replacement for cross-entropy loss","summary":" Cross-entropy (CE) loss is the de-facto standard for training deep neural\nnetworks to perform classification. However, CE-trained deep neural networks\nstruggle with robustness and generalisation issues. To alleviate these issues,\nwe propose high error margin (HEM) loss, a variant of multi-class margin loss\nthat overcomes the training issues of other margin-based losses. We evaluate\nHEM extensively on a range of architectures and datasets. We find that HEM loss\nis more effective than cross-entropy loss across a wide range of tasks: unknown\nclass rejection, adversarial robustness, learning with imbalanced data,\ncontinual learning, and semantic segmentation (a pixel-level classification\ntask). Despite all training hyper-parameters being chosen for CE loss, HEM is\ninferior to CE only in terms of clean accuracy and this difference is\ninsignificant. We also compare HEM to specialised losses that have previously\nbeen proposed to improve performance on specific tasks. LogitNorm, a loss\nachieving state-of-the-art performance on unknown class rejection, produces\nsimilar performance to HEM for this task, but is much poorer for continual\nlearning and semantic segmentation. Logit-adjusted loss, designed for\nimbalanced data, has superior results to HEM for that task, but performs more\npoorly on unknown class rejection and semantic segmentation. 
DICE, a popular\nloss for semantic segmentation, is inferior to HEM loss on all tasks, including\nsemantic segmentation. Thus, HEM often out-performs specialised losses, and in\ncontrast to them, is a general-purpose replacement for CE loss.\n","authors":["Michael W. Spratling","Heiko H. Schütt"],"pdf_url":"https://arxiv.org/pdf/2501.12191v1.pdf","comment":"Code: https://codeberg.org/mwspratling/HEMLoss"},{"id":"http://arxiv.org/abs/2403.02302v4","updated":"2025-01-21T14:50:25Z","published":"2024-03-04T18:32:12Z","title":"Beyond Specialization: Assessing the Capabilities of MLLMs in Age and\n Gender Estimation","summary":" Multimodal Large Language Models (MLLMs) have recently gained immense\npopularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as\nopen-source ones such as LLaVA, are essentially general-purpose models and are\napplied to solve a wide variety of tasks, including those in computer vision.\nThese neural networks possess such strong general knowledge and reasoning\nabilities that they have proven capable of working even on tasks for which they\nwere not specifically trained. We compared the capabilities of the most\npowerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task\nof age and gender estimation with our state-of-the-art specialized model,\nMiVOLO. We also updated MiVOLO and provide details and new metrics in this\narticle. This comparison has yielded some interesting results and insights\nabout the strengths and weaknesses of the participating models. Furthermore, we\nattempted various ways to fine-tune the ShareGPT4V model for this specific\ntask, aiming to achieve state-of-the-art results in this particular challenge.\nAlthough such a model would not be practical in production, as it is incredibly\nexpensive compared to a specialized model like MiVOLO, it could be very useful\nin some tasks, like data annotation.\n","authors":["Maksim Kuprashevich","Grigorii Alekseenko","Irina Tolstykh"],"pdf_url":"https://arxiv.org/pdf/2403.02302v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11303v3","updated":"2025-01-21T14:40:56Z","published":"2024-02-17T15:03:25Z","title":"FViT: A Focal Vision Transformer with Gabor Filter","summary":" Vision transformers have achieved encouraging progress in various computer\nvision tasks. A common belief is that this is attributed to the capability of\nself-attention in modeling the global dependencies among feature tokens.\nHowever, self-attention still faces several challenges in dense prediction\ntasks, including high computational complexity and absence of desirable\ninductive bias. To alleviate these issues, the potential advantages of\ncombining vision transformers with Gabor filters are revisited, and a learnable\nGabor filter (LGF) using convolution is proposed. The LGF does not rely on\nself-attention, and it is used to simulate the response of fundamental cells in\nthe biological visual system to the input images. This encourages vision\ntransformers to focus on discriminative feature representations of targets\nacross different scales and orientations. In addition, a Bionic Focal Vision\n(BFV) block is designed based on the LGF. This block draws inspiration from\nneuroscience and introduces a Dual-Path Feed Forward Network (DPFFN) to emulate\nthe parallel and cascaded information processing scheme of the biological\nvisual cortex. Furthermore, a unified and efficient family of pyramid backbone\nnetworks called Focal Vision Transformers (FViTs) is developed by stacking BFV\nblocks. 
Experimental results indicate that FViTs demonstrate superior\nperformance in various vision tasks. In terms of computational efficiency and\nscalability, FViTs show significant advantages compared with other\ncounterparts.\n","authors":["Yulong Shi","Mingwei Sun","Yongshuai Wang","Zengqiang Chen"],"pdf_url":"https://arxiv.org/pdf/2402.11303v3.pdf","comment":"This work has been submitted to Elsevier for possible publication"},{"id":"http://arxiv.org/abs/2501.12178v1","updated":"2025-01-21T14:35:35Z","published":"2025-01-21T14:35:35Z","title":"High-dimensional multimodal uncertainty estimation by manifold\n alignment:Application to 3D right ventricular strain computations","summary":" Confidence in the results is a key ingredient to improve the adoption of\nmachine learning methods by clinicians. Uncertainties on the results have been\nconsidered in the literature, but mostly those originating from the learning\nand processing methods. Uncertainty on the data is hardly challenged, as a\nsingle sample is often considered representative enough of each subject\nincluded in the analysis. In this paper, we propose a representation learning\nstrategy to estimate local uncertainties on a physiological descriptor (here,\nmyocardial deformation) previously obtained from medical images by different\ndefinitions or computations. We first use manifold alignment to match the\nlatent representations associated to different high-dimensional input\ndescriptors. Then, we formulate plausible distributions of latent\nuncertainties, and finally exploit them to reconstruct uncertainties on the\ninput high-dimensional descriptors. We demonstrate its relevance for the\nquantification of myocardial deformation (strain) from 3D echocardiographic\nimage sequences of the right ventricle, for which a lack of consensus exists in\nits definition and which directional component to use. We used a database of\n100 control subjects with right ventricle overload, for which different types\nof strain are available at each point of the right ventricle endocardial\nsurface mesh. Our approach quantifies local uncertainties on myocardial\ndeformation from different descriptors defining this physiological concept.\nSuch uncertainties cannot be directly estimated by local statistics on such\ndescriptors, potentially of heterogeneous types. Beyond this controlled\nillustrative application, our methodology has the potential to be generalized\nto many other population analyses considering heterogeneous high-dimensional\ndescriptors.\n","authors":["Maxime Di Folco","Gabriel Bernardino","Patrick Clarysse","Nicolas Duchateau"],"pdf_url":"https://arxiv.org/pdf/2501.12178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12173v1","updated":"2025-01-21T14:32:47Z","published":"2025-01-21T14:32:47Z","title":"ComposeAnyone: Controllable Layout-to-Human Generation with Decoupled\n Multimodal Conditions","summary":" Building on the success of diffusion models, significant advancements have\nbeen made in multimodal image generation tasks. Among these, human image\ngeneration has emerged as a promising technique, offering the potential to\nrevolutionize the fashion design process. However, existing methods often focus\nsolely on text-to-image or image reference-based human generation, which fails\nto satisfy the increasingly sophisticated demands. To address the limitations\nof flexibility and precision in human generation, we introduce ComposeAnyone, a\ncontrollable layout-to-human generation method with decoupled multimodal\nconditions. 
Specifically, our method allows decoupled control of any part in\nhand-drawn human layouts using text or reference images, seamlessly integrating\nthem during the generation process. The hand-drawn layout, which utilizes\ncolor-blocked geometric shapes such as ellipses and rectangles, can be easily\ndrawn, offering a more flexible and accessible way to define spatial layouts.\nAdditionally, we introduce the ComposeHuman dataset, which provides decoupled\ntext and reference image annotations for different components of each human\nimage, enabling broader applications in human image generation tasks. Extensive\nexperiments on multiple datasets demonstrate that ComposeAnyone generates human\nimages with better alignment to given layouts, text descriptions, and reference\nimages, showcasing its multi-task capability and controllability.\n","authors":["Shiyue Zhang","Zheng Chong","Xi Lu","Wenqing Zhang","Haoxiang Li","Xujie Zhang","Jiehui Huang","Xiao Dong","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2501.12173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12169v1","updated":"2025-01-21T14:29:27Z","published":"2025-01-21T14:29:27Z","title":"SVGS-DSGAT: An IoT-Enabled Innovation in Underwater Robotic Object\n Detection Technology","summary":" With the advancement of Internet of Things (IoT) technology, underwater\ntarget detection and tracking have become increasingly important for ocean\nmonitoring and resource management. Existing methods often fall short in\nhandling high-noise and low-contrast images in complex underwater environments,\nlacking precision and robustness. This paper introduces a novel SVGS-DSGAT\nmodel that combines GraphSage, SVAM, and DSGAT modules, enhancing feature\nextraction and target detection capabilities through graph neural networks and\nattention mechanisms. The model integrates IoT technology to facilitate\nreal-time data collection and processing, optimizing resource allocation and\nmodel responsiveness. Experimental results demonstrate that the SVGS-DSGAT\nmodel achieves an mAP of 40.8% on the URPC 2020 dataset and 41.5% on the\nSeaDronesSee dataset, significantly outperforming existing mainstream models.\nThis IoT-enhanced approach not only excels in high-noise and complex\nbackgrounds but also improves the overall efficiency and scalability of the\nsystem. This research provides an effective IoT solution for underwater target\ndetection technology, offering significant practical application value and\nbroad development prospects.\n","authors":["Dongli Wu","Ling Luo"],"pdf_url":"https://arxiv.org/pdf/2501.12169v1.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.12157v1","updated":"2025-01-21T14:09:58Z","published":"2025-01-21T14:09:58Z","title":"Fast-RF-Shimming: Accelerate RF Shimming in 7T MRI using Deep Learning","summary":" Ultrahigh field (UHF) Magnetic Resonance Imaging (MRI) provides a high\nsignal-to-noise ratio (SNR), enabling exceptional spatial resolution for\nclinical diagnostics and research. However, higher fields introduce challenges\nsuch as transmit radiofrequency (RF) field inhomogeneities, which result in\nuneven flip angles and image intensity artifacts. These artifacts degrade image\nquality and limit clinical adoption. Traditional RF shimming methods, including\nMagnitude Least Squares (MLS) optimization, mitigate RF field inhomogeneity but\nare time-intensive and often require the presence of the patient. 
Recent\nmachine learning methods, such as RF Shim Prediction by Iteratively Projected\nRidge Regression and other deep learning architectures, offer alternative\napproaches but face challenges such as extensive training requirements, limited\ncomplexity, and practical data constraints. This paper introduces a holistic\nlearning-based framework called Fast RF Shimming, which achieves a 5000-fold\nspeedup compared to MLS methods. First, random-initialized Adaptive Moment\nEstimation (Adam) derives reference shimming weights from multichannel RF\nfields. Next, a Residual Network (ResNet) maps RF fields to shimming outputs\nwhile incorporating a confidence parameter into the loss function. Finally, a\nNon-uniformity Field Detector (NFD) identifies extreme non-uniform outcomes.\nComparative evaluations demonstrate significant improvements in both speed and\npredictive accuracy. The proposed pipeline also supports potential extensions,\nsuch as the integration of anatomical priors or multi-echo data, to enhance the\nrobustness of RF field correction. This approach offers a faster and more\nefficient solution to RF shimming challenges in UHF MRI.\n","authors":["Zhengyi Lu","Hao Liang","Ming Lu","Xiao Wang","Xinqiang Yan","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2501.12157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12150v1","updated":"2025-01-21T14:01:10Z","published":"2025-01-21T14:01:10Z","title":"DNRSelect: Active Best View Selection for Deferred Neural Rendering","summary":" Deferred neural rendering (DNR) is an emerging computer graphics pipeline\ndesigned for high-fidelity rendering and robotic perception. However, DNR\nheavily relies on datasets composed of numerous ray-traced images and demands\nsubstantial computational resources. It remains under-explored how to reduce\nthe reliance on high-quality ray-traced images while maintaining the rendering\nfidelity. In this paper, we propose DNRSelect, which integrates a reinforcement\nlearning-based view selector and a 3D texture aggregator for deferred neural\nrendering. We first propose a novel view selector for deferred neural rendering\nbased on reinforcement learning, which is trained on easily obtained rasterized\nimages to identify the optimal views. By acquiring only a few ray-traced images\nfor these selected views, the selector enables DNR to achieve high-quality\nrendering. To further enhance spatial awareness and geometric consistency in\nDNR, we introduce a 3D texture aggregator that fuses pyramid features from\ndepth maps and normal maps with UV maps. Given that acquiring ray-traced images\nis more time-consuming than generating rasterized images, DNRSelect minimizes\nthe need for ray-traced data by using only a few selected views while still\nachieving high-fidelity rendering results. We conduct detailed experiments and\nablation studies on the NeRF-Synthetic dataset to demonstrate the effectiveness\nof DNRSelect. 
The code will be released.\n","authors":["Dongli Wu","Haochen Li","Xiaobao Wei"],"pdf_url":"https://arxiv.org/pdf/2501.12150v1.pdf","comment":"7 pages, 8 figures, submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2501.12119v1","updated":"2025-01-21T13:30:16Z","published":"2025-01-21T13:30:16Z","title":"ENTIRE: Learning-based Volume Rendering Time Prediction","summary":" We present ENTIRE, a novel approach for volume rendering time prediction.\nTime-dependent volume data from simulations or experiments typically comprise\ncomplex deforming structures across hundreds or thousands of time steps, which\nin addition to the camera configuration has a significant impact on rendering\nperformance. We first extract a feature vector from a volume that captures its\nstructure that is relevant for rendering time performance. Then we combine this\nfeature vector with further relevant parameters (e.g. camera setup), and with\nthis perform the final prediction. Our experiments conducted on various\ndatasets demonstrate that our model is capable of efficiently achieving high\nprediction accuracy with fast response rates. We showcase ENTIRE's capability\nof enabling dynamic parameter adaptation for stable frame rates and load\nbalancing in two case studies.\n","authors":["Zikai Yin","Hamid Gadirov","Jiri Kosinka","Steffen Frey"],"pdf_url":"https://arxiv.org/pdf/2501.12119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12115v1","updated":"2025-01-21T13:25:32Z","published":"2025-01-21T13:25:32Z","title":"Meta-Sparsity: Learning Optimal Sparse Structures in Multi-task Networks\n through Meta-learning","summary":" This paper presents meta-sparsity, a framework for learning model sparsity,\nbasically learning the parameter that controls the degree of sparsity, that\nallows deep neural networks (DNNs) to inherently generate optimal sparse shared\nstructures in multi-task learning (MTL) setting. This proposed approach enables\nthe dynamic learning of sparsity patterns across a variety of tasks, unlike\ntraditional sparsity methods that rely heavily on manual hyperparameter tuning.\nInspired by Model Agnostic Meta-Learning (MAML), the emphasis is on learning\nshared and optimally sparse parameters in multi-task scenarios by implementing\na penalty-based, channel-wise structured sparsity during the meta-training\nphase. This method improves the model's efficacy by removing unnecessary\nparameters and enhances its ability to handle both seen and previously unseen\ntasks. The effectiveness of meta-sparsity is rigorously evaluated by extensive\nexperiments on two datasets, NYU-v2 and CelebAMask-HQ, covering a broad\nspectrum of tasks ranging from pixel-level to image-level predictions. The\nresults show that the proposed approach performs well across many tasks,\nindicating its potential as a versatile tool for creating efficient and\nadaptable sparse neural networks. 
This work, therefore, presents an approach\ntowards learning sparsity, contributing to the efforts in the field of sparse\nneural networks and suggesting new directions for research towards parsimonious\nmodels.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2501.12115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05693v2","updated":"2025-01-21T13:15:29Z","published":"2024-04-08T17:18:30Z","title":"Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic\n Segmentation for Satellite Imagery","summary":" Satellite imagery is crucial for tasks like environmental monitoring and\nurban planning. Typically, it relies on semantic segmentation or Land Use Land\nCover (LULC) classification to categorize each pixel. Despite the advancements\nbrought about by Deep Neural Networks (DNNs), their performance in segmentation\ntasks is hindered by challenges such as limited availability of labeled data,\nclass imbalance and the inherent variability and complexity of satellite\nimages. In order to mitigate those issues, our study explores the effectiveness\nof a Cut-and-Paste augmentation technique for semantic segmentation in\nsatellite images. We adapt this augmentation, which usually requires labeled\ninstances, to the case of semantic segmentation. By leveraging the connected\ncomponents in the semantic segmentation labels, we extract instances that are\nthen randomly pasted during training. Using the DynamicEarthNet dataset and a\nU-Net model for evaluation, we found that this augmentation significantly\nenhances the mIoU score on the test set from 37.9 to 44.1. This finding\nhighlights the potential of the Cut-and-Paste augmentation to improve the\ngeneralization capabilities of semantic segmentation models in satellite\nimagery.\n","authors":["Ionut M. Motoi","Leonardo Saraceni","Daniele Nardi","Thomas A. Ciarfuglia"],"pdf_url":"https://arxiv.org/pdf/2404.05693v2.pdf","comment":"Published in: IGARSS 2024 - 2024 IEEE International Geoscience and\n Remote Sensing Symposium"},{"id":"http://arxiv.org/abs/2501.12104v1","updated":"2025-01-21T12:55:04Z","published":"2025-01-21T12:55:04Z","title":"Teacher Encoder-Student Decoder Denoising Guided Segmentation Network\n for Anomaly Detection","summary":" Visual anomaly detection is a highly challenging task, often categorized as a\none-class classification and segmentation problem. Recent studies have\ndemonstrated that the student-teacher (S-T) framework effectively addresses\nthis challenge. However, most S-T frameworks rely solely on pre-trained teacher\nnetworks to guide student networks in learning multi-scale similar features,\noverlooking the potential of the student networks to enhance learning through\nmulti-scale feature fusion. In this study, we propose a novel model named\nPFADSeg, which integrates a pre-trained teacher network, a denoising student\nnetwork with multi-scale feature fusion, and a guided anomaly segmentation\nnetwork into a unified framework. By adopting a unique teacher-encoder and\nstudent-decoder denoising mode, the model improves the student network's\nability to learn from teacher network features. Furthermore, an adaptive\nfeature fusion mechanism is introduced to train a self-supervised segmentation\nnetwork that synthesizes anomaly masks autonomously, significantly increasing\ndetection performance. 
Evaluated on the MVTec AD dataset, PFADSeg achieves\nstate-of-the-art results with an image-level AUC of 98.9%, a pixel-level mean\nprecision of 76.4%, and an instance-level mean precision of 78.7%.\n","authors":["ShiXuan Song","Hao Chen","Shu Hu","Xin Wang","Jinrong Hu","Xi Wu"],"pdf_url":"https://arxiv.org/pdf/2501.12104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12102v1","updated":"2025-01-21T12:49:30Z","published":"2025-01-21T12:49:30Z","title":"Proxies for Distortion and Consistency with Applications for Real-World\n Image Restoration","summary":" Real-world image restoration deals with the recovery of images suffering from\nan unknown degradation. This task is typically addressed while being given only\ndegraded images, without their corresponding ground-truth versions. In this\nhard setting, designing and evaluating restoration algorithms becomes highly\nchallenging. This paper offers a suite of tools that can serve both the design\nand assessment of real-world image restoration algorithms. Our work starts by\nproposing a trained model that predicts the chain of degradations a given\nreal-world measured input has gone through. We show how this estimator can be\nused to approximate the consistency -- the match between the measurements and\nany proposed recovered image. We also use this estimator as a guiding force for\nthe design of a simple and highly-effective plug-and-play real-world image\nrestoration algorithm, leveraging a pre-trained diffusion-based image prior.\nFurthermore, this work proposes no-reference proxy measures of MSE and LPIPS,\nwhich, without access to the ground-truth images, allow ranking of real-world\nimage restoration algorithms according to their (approximate) MSE and LPIPS.\nThe proposed suite provides a versatile, first of its kind framework for\nevaluating and comparing blind image restoration algorithms in real-world\nscenarios.\n","authors":["Sean Man","Guy Ohayon","Ron Raphaeli","Michael Elad"],"pdf_url":"https://arxiv.org/pdf/2501.12102v1.pdf","comment":"Project page in https://man-sean.github.io/elad-website/"},{"id":"http://arxiv.org/abs/2412.08344v3","updated":"2025-01-21T12:30:57Z","published":"2024-12-11T12:34:37Z","title":"CoDTS: Enhancing Sparsely Supervised Collaborative Perception with a\n Dual Teacher-Student Framework","summary":" Current collaborative perception methods often rely on fully annotated\ndatasets, which can be expensive to obtain in practical situations. To reduce\nannotation costs, some works adopt sparsely supervised learning techniques and\ngenerate pseudo labels for the missing instances. However, these methods fail\nto achieve an optimal confidence threshold that harmonizes the quality and\nquantity of pseudo labels. To address this issue, we propose an end-to-end\nCollaborative perception Dual Teacher-Student framework (CoDTS), which employs\nadaptive complementary learning to produce both high-quality and high-quantity\npseudo labels. Specifically, the Main Foreground Mining (MFM) module generates\nhigh-quality pseudo labels based on the prediction of the static teacher.\nSubsequently, the Supplement Foreground Mining (SFM) module ensures a balance\nbetween the quality and quantity of pseudo labels by adaptively identifying\nmissing instances based on the prediction of the dynamic teacher. Additionally,\nthe Neighbor Anchor Sampling (NAS) module is incorporated to enhance the\nrepresentation of pseudo labels. 
To promote the adaptive complementary\nlearning, we implement a staged training strategy that trains the student and\ndynamic teacher in a mutually beneficial manner. Extensive experiments\ndemonstrate that the CoDTS effectively ensures an optimal balance of pseudo\nlabels in both quality and quantity, establishing a new state-of-the-art in\nsparsely supervised collaborative perception.\n","authors":["Yushan Han","Hui Zhang","Honglei Zhang","Jing Wang","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2412.08344v3.pdf","comment":"AAAI 2025 (Oral)"},{"id":"http://arxiv.org/abs/2501.12087v1","updated":"2025-01-21T12:29:45Z","published":"2025-01-21T12:29:45Z","title":"UAV-Assisted Real-Time Disaster Detection Using Optimized Transformer\n Model","summary":" Disaster recovery and management present significant challenges, particularly\nin unstable environments and hard-to-reach terrains. These difficulties can be\novercome by employing unmanned aerial vehicles (UAVs) equipped with onboard\nembedded platforms and camera sensors. In this work, we address the critical\nneed for accurate and timely disaster detection by enabling onboard aerial\nimagery processing and avoiding connectivity, privacy, and latency issues\ndespite the challenges posed by limited onboard hardware resources. We propose\na UAV-assisted edge framework for real-time disaster management, leveraging our\nproposed model optimized for real-time aerial image classification. The\noptimization of the model employs post-training quantization techniques. For\nreal-world disaster scenarios, we introduce a novel dataset, DisasterEye,\nfeaturing UAV-captured disaster scenes as well as ground-level images taken by\nindividuals on-site. Experimental results demonstrate the effectiveness of our\nmodel, achieving high accuracy with reduced inference latency and memory usage\non resource-constrained devices. The framework's scalability and adaptability\nmake it a robust solution for real-time disaster detection on resource-limited\nUAV platforms.\n","authors":["Branislava Jankovic","Sabina Jangirova","Waseem Ullah","Latif U. Khan","Mohsen Guizani"],"pdf_url":"https://arxiv.org/pdf/2501.12087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12086v1","updated":"2025-01-21T12:28:36Z","published":"2025-01-21T12:28:36Z","title":"DSTSA-GCN: Advancing Skeleton-Based Gesture Recognition with\n Semantic-Aware Spatio-Temporal Topology Modeling","summary":" Graph convolutional networks (GCNs) have emerged as a powerful tool for\nskeleton-based action and gesture recognition, thanks to their ability to model\nspatial and temporal dependencies in skeleton data. However, existing GCN-based\nmethods face critical limitations: (1) they lack effective spatio-temporal\ntopology modeling that captures dynamic variations in skeletal motion, and (2)\nthey struggle to model multiscale structural relationships beyond local joint\nconnectivity. To address these issues, we propose a novel framework called\nDynamic Spatial-Temporal Semantic Awareness Graph Convolutional Network\n(DSTSA-GCN). DSTSA-GCN introduces three key modules: Group Channel-wise Graph\nConvolution (GC-GC), Group Temporal-wise Graph Convolution (GT-GC), and\nMulti-Scale Temporal Convolution (MS-TCN). GC-GC and GT-GC operate in parallel\nto independently model channel-specific and frame-specific correlations,\nenabling robust topology learning that accounts for temporal variations.\nAdditionally, both modules employ a grouping strategy to adaptively capture\nmultiscale structural relationships. 
Complementing this, MS-TCN enhances\ntemporal modeling through group-wise temporal convolutions with diverse\nreceptive fields. Extensive experiments demonstrate that DSTSA-GCN\nsignificantly improves the topology modeling capabilities of GCNs, achieving\nstate-of-the-art performance on benchmark datasets for gesture and action\nrecognition, including SHREC17 Track, DHG-14\\/28, NTU-RGB+D, and NTU-RGB+D-120.\n","authors":["Hu Cui","Renjing Huang","Ruoyu Zhang","Tessai Hayama"],"pdf_url":"https://arxiv.org/pdf/2501.12086v1.pdf","comment":"submit to Neurocomputing"},{"id":"http://arxiv.org/abs/2501.12085v1","updated":"2025-01-21T12:22:15Z","published":"2025-01-21T12:22:15Z","title":"Scalable Whole Slide Image Representation Using K-Mean Clustering and\n Fisher Vector Aggregation","summary":" Whole slide images (WSIs) are high-resolution, gigapixel sized images that\npose significant computational challenges for traditional machine learning\nmodels due to their size and heterogeneity. In this paper, we present a scalable\nand efficient methodology for WSI classification by leveraging patch-based\nfeature extraction, clustering, and Fisher vector encoding. Initially, WSIs are\ndivided into fixed size patches, and deep feature embeddings are extracted from\neach patch using a pre-trained convolutional neural network (CNN). These\npatch-level embeddings are subsequently clustered using K-means clustering,\nwhere each cluster aggregates semantically similar regions of the WSI. To\neffectively summarize each cluster, Fisher vector representations are computed\nby modeling the distribution of patch embeddings in each cluster as a\nparametric Gaussian mixture model (GMM). The Fisher vectors from each cluster\nare concatenated into a high-dimensional feature vector, creating a compact and\ninformative representation of the entire WSI. This feature vector is then used\nby a classifier to predict the WSI's diagnostic label. Our method captures\nlocal and global tissue structures and yields robust performance for\nlarge-scale WSI classification, demonstrating superior accuracy and scalability\ncompared to other approaches.\n","authors":["Ravi Kant Gupta","Shounak Das","Ardhendu Sekhar","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2501.12085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12082v1","updated":"2025-01-21T12:15:16Z","published":"2025-01-21T12:15:16Z","title":"A Multi-annotated and Multi-modal Dataset for Wide-angle Video Quality\n Assessment","summary":" Wide-angle video is favored for its wide viewing angle and ability to capture\na large area of scenery, making it an ideal choice for sports and adventure\nrecording. However, wide-angle video is prone to deformation, exposure and\nother distortions, resulting in poor video quality and affecting the perception\nand experience, which may seriously hinder its application in fields such as\ncompetitive sports. Up to now, few explorations focus on the quality assessment\nissue of wide-angle video. This deficiency primarily stems from the absence of\na specialized dataset for wide-angle videos. To bridge this gap, we construct\nthe first Multi-annotated and multi-modal Wide-angle Video quality assessment\n(MWV) dataset. Then, the performances of state-of-the-art video quality methods\non the MWV dataset are investigated by inter-dataset testing and intra-dataset\ntesting. 
Experimental results show that these methods have significant\nlimitations in their applicability.\n","authors":["Bo Hu","Wei Wang","Chunyi Li","Lihuo He","Leida Li","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2501.12082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12073v1","updated":"2025-01-21T11:59:07Z","published":"2025-01-21T11:59:07Z","title":"Towards autonomous photogrammetric forest inventory using a lightweight\n under-canopy robotic drone","summary":" Drones are increasingly used in forestry to capture high-resolution remote\nsensing data. While operations above the forest canopy are already highly\nautomated, flying inside forests remains challenging, primarily relying on\nmanual piloting. Inside dense forests, reliance on the Global Navigation\nSatellite System (GNSS) for localization is not feasible. Additionally, the\ndrone must autonomously adjust its flight path to avoid collisions. Recently,\nadvancements in robotics have enabled autonomous drone flights in GNSS-denied\nobstacle-rich areas. In this article, a step towards autonomous forest data\ncollection is taken by building a prototype of a robotic under-canopy drone\nutilizing state-of-the-art open-source methods and validating its performance\nfor data collection inside forests. The autonomous flight capability was\nevaluated through multiple test flights in two boreal forest test sites. The\ntree parameter estimation capability was studied by conducting diameter at\nbreast height (DBH) estimation using onboard stereo camera data and\nphotogrammetric methods. The prototype conducted flights in selected\nchallenging forest environments, and the experiments showed excellent\nperformance in forest reconstruction with a miniaturized stereoscopic\nphotogrammetric system. The stem detection algorithm managed to identify 79.31\n% of the stems. The DBH estimation had a root mean square error (RMSE) of 3.33\ncm (12.79 %) and a bias of 1.01 cm (3.87 %) across all trees. For trees with a\nDBH less than 30 cm, the RMSE was 1.16 cm (5.74 %), and the bias was 0.13 cm\n(0.64 %). When considering the overall performance in terms of DBH accuracy,\nautonomy, and forest complexity, the proposed approach was superior compared to\nmethods proposed in the scientific literature. Results provided valuable\ninsights into autonomous forest reconstruction using drones, and several\nfurther development topics were proposed.\n","authors":["Väinö Karjalainen","Niko Koivumäki","Teemu Hakala","Jesse Muhojoki","Eric Hyyppä","Anand George","Juha Suomalainen","Eija Honkavaara"],"pdf_url":"https://arxiv.org/pdf/2501.12073v1.pdf","comment":"35 pages, 13 Figures"},{"id":"http://arxiv.org/abs/2501.12071v1","updated":"2025-01-21T11:54:37Z","published":"2025-01-21T11:54:37Z","title":"Co-Paced Learning Strategy Based on Confidence for Flying Bird Object\n Detection Model Training","summary":" To mitigate the adverse effects of hard samples on the training of the Flying\nBird Object Detection (FBOD) model for surveillance videos, we propose a\nCo-Paced Learning Based on Confidence (CPL-BC) strategy and apply this strategy\nto the training process of the FBOD model. This strategy involves maintaining\ntwo models with identical structures but different initial parameter\nconfigurations, which collaborate with each other to select easy samples with\nprediction confidence exceeding a set threshold for training. 
As training\nprogresses, the strategy gradually lowers the threshold, allowing more samples\nto participate, enhancing the model's ability to recognize objects from easy to\nhard. Before applying the CPL-BC strategy to train the FBOD models, we\ninitially trained the two FBOD models to equip them with the capability to\nassess the difficulty level of flying bird object samples. Experimental results\non two different datasets of flying bird objects in surveillance videos\ndemonstrate that, compared to other model learning strategies, CPL-BC\nsignificantly improves detection accuracy, verifying the effectiveness and\nadvancement of this method.\n","authors":["Zi-Wei Sun","Ze-Xi Hua","Heng-Chao Li","Yan Li"],"pdf_url":"https://arxiv.org/pdf/2501.12071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12060v1","updated":"2025-01-21T11:30:51Z","published":"2025-01-21T11:30:51Z","title":"GaussianVideo: Efficient Video Representation Through 2D Gaussian\n Splatting","summary":" 3D Gaussian splats have emerged as a revolutionary, effective, learned\nrepresentation for static 3D scenes. In this work, we explore using 2D Gaussian\nsplats as a new primitive for representing videos. We propose GaussianVideo, an\napproach to learning a set of 2D Gaussian splats that can effectively represent\nvideo frames. GaussianVideo incorporates the following techniques: (i) To\nexploit temporal redundancy among adjacent frames, which can speed up training\nand improve the compression efficiency, we predict the Gaussian splats of a\nframe based on its previous frame; (ii) To control the trade-offs between file\nsize and quality, we remove Gaussian splats with low contribution to the video\nquality; (iii) To capture dynamics in videos, we randomly add Gaussian splats\nto fit content with large motion or newly-appeared objects; (iv) To handle\nsignificant changes in the scene, we detect key frames based on loss\ndifferences during the learning process. Experiment results show that\nGaussianVideo achieves good rate-distortion trade-offs, comparable to\nstate-of-the-art video codecs such as AV1 and VVC, and a rendering speed of\n1500 fps for a 1920x1080 video.\n","authors":["Longan Wang","Yuang Shi","Wei Tsang Ooi"],"pdf_url":"https://arxiv.org/pdf/2501.12060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12057v1","updated":"2025-01-21T11:27:54Z","published":"2025-01-21T11:27:54Z","title":"Unified 3D MRI Representations via Sequence-Invariant Contrastive\n Learning","summary":" Self-supervised deep learning has accelerated 2D natural image analysis but\nremains difficult to translate into 3D MRI, where data are scarce and\npre-trained 2D backbones cannot capture volumetric context. We present a\nsequence-invariant self-supervised framework leveraging quantitative MRI\n(qMRI). By simulating multiple MRI contrasts from a single 3D qMRI scan and\nenforcing consistent representations across these contrasts, we learn\nanatomy-centric rather than sequence-specific features. This yields a robust 3D\nencoder that performs strongly across varied tasks and protocols. Experiments\non healthy brain segmentation (IXI), stroke lesion segmentation (ARC), and MRI\ndenoising show significant gains over baseline SSL approaches, especially in\nlow-data settings (up to +8.3% Dice, +4.2 dB PSNR). Our model also generalises\neffectively to unseen sites, demonstrating potential for more scalable and\nclinically reliable volumetric analysis. 
All code and trained models are\npublicly available.\n","authors":["Liam Chalcroft","Jenny Cronin","Cathy J. Price","John Ashburner"],"pdf_url":"https://arxiv.org/pdf/2501.12057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12054v1","updated":"2025-01-21T11:26:02Z","published":"2025-01-21T11:26:02Z","title":"ORCAst: Operational High-Resolution Current Forecasts","summary":" We present ORCAst, a multi-stage, multi-arm network for Operational\nhigh-Resolution Current forecAsts over one week. Producing real-time nowcasts\nand forecasts of ocean surface currents is a challenging problem due to\nindirect or incomplete information from satellite remote sensing data. Entirely\ntrained on real satellite data and in situ measurements from drifters, our\nmodel learns to forecast global ocean surface currents using various sources of\nground truth observations in a multi-stage learning procedure. Our multi-arm\nencoder-decoder model architecture allows us to first predict sea surface\nheight and geostrophic currents from larger quantities of nadir and SWOT\naltimetry data, before learning to predict ocean surface currents from much\nmore sparse in situ measurements from drifters. Training our model on specific\nregions improves performance. Our model achieves stronger nowcast and forecast\nperformance in predicting ocean surface currents than various state-of-the-art\nmethods.\n","authors":["Pierre Garcia","Inès Larroche","Amélie Pesnec","Hannah Bull","Théo Archambault","Evangelos Moschos","Alexandre Stegner","Anastase Charantonis","Dominique Béréziat"],"pdf_url":"https://arxiv.org/pdf/2501.12054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12052v1","updated":"2025-01-21T11:25:44Z","published":"2025-01-21T11:25:44Z","title":"Aggrotech: Leveraging Deep Learning for Sustainable Tomato Disease\n Management","summary":" Tomato crop health plays a critical role in ensuring agricultural\nproductivity and food security. Timely and accurate detection of diseases\naffecting tomato plants is vital for effective disease management. In this\nstudy, we propose a deep learning-based approach for Tomato Leaf Disease\nDetection using two well-established convolutional neural networks (CNNs),\nnamely VGG19 and Inception v3. The experiment is conducted on the Tomato\nVillages Dataset, encompassing images of both healthy tomato leaves and leaves\nafflicted by various diseases. The VGG19 model is augmented with fully\nconnected layers, while the Inception v3 model is modified to incorporate a\nglobal average pooling layer and a dense classification layer. Both models are\ntrained on the prepared dataset, and their performances are evaluated on a\nseparate test set. This research employs VGG19 and Inception v3 models on the\nTomato Villages dataset (4525 images) for tomato leaf disease detection. The\nmodels' accuracy of 93.93% with dropout layers demonstrates their usefulness\nfor crop health monitoring. The paper suggests a deep learning-based strategy\nthat includes normalization, resizing, dataset preparation, and unique model\narchitectures. During training, VGG19 and Inception v3 serve as feature\nextractors, with possible data augmentation and fine-tuning. Metrics like\naccuracy, precision, recall, and F1 score are obtained through evaluation on a\ntest set and offer important insights into the strengths and shortcomings of\nthe model. 
The method has the potential for practical use in precision\nagriculture and could help prevent illness in tomato crops early on.\n","authors":["MD Mehraz Hosen","Md. Hasibul Islam"],"pdf_url":"https://arxiv.org/pdf/2501.12052v1.pdf","comment":"10 pages, 6 figures, ROC curves, confusion matrix analysis, and\n classification reports"},{"id":"http://arxiv.org/abs/2501.12048v1","updated":"2025-01-21T11:21:16Z","published":"2025-01-21T11:21:16Z","title":"Adaptive Class Learning to Screen Diabetic Disorders in Fundus Images of\n Eye","summary":" The prevalence of ocular illnesses is growing globally, presenting a\nsubstantial public health challenge. Early detection and timely intervention\nare crucial for averting visual impairment and enhancing patient prognosis.\nThis research introduces a new framework called Class Extension with Limited\nData (CELD) to train a classifier to categorize retinal fundus images. The\nclassifier is initially trained to identify relevant features concerning\nHealthy and Diabetic Retinopathy (DR) classes and later fine-tuned to adapt to\nthe task of classifying the input images into three classes: Healthy, DR, and\nGlaucoma. This strategy allows the model to gradually enhance its\nclassification capabilities, which is beneficial in situations where there are\nonly a limited number of labeled datasets available. Perturbation methods are\nalso used to identify the input image characteristics responsible for\ninfluencing the model's decision-making process. We achieve an overall accuracy\nof 91% on publicly available datasets.\n","authors":["Shramana Dey","Pallabi Dutta","Riddhasree Bhattacharyya","Surochita Pal","Sushmita Mitra","Rajiv Raman"],"pdf_url":"https://arxiv.org/pdf/2501.12048v1.pdf","comment":"Accepted at International Conference on Pattern Recognition (ICPR)\n 2024"},{"id":"http://arxiv.org/abs/2501.12030v1","updated":"2025-01-21T10:48:13Z","published":"2025-01-21T10:48:13Z","title":"Advancing Earth Observation: A Survey on AI-Powered Image Processing in\n Satellites","summary":" Advancements in technology and reduction in its cost have led to a\nsubstantial growth in the quality & quantity of imagery captured by Earth\nObservation (EO) satellites. This has presented a challenge to the efficacy of\nthe traditional workflow of transmitting this imagery to Earth for processing.\nAn approach to addressing this issue is to use pre-trained artificial\nintelligence models to process images on-board the satellite, but this is\ndifficult given the constraints within a satellite's environment. This paper\nprovides an up-to-date and thorough review of research related to image\nprocessing on-board Earth observation satellites. The significant constraints\nare detailed along with the latest strategies to mitigate them.\n","authors":["Aidan Duggan","Bruno Andrade","Haithem Afli"],"pdf_url":"https://arxiv.org/pdf/2501.12030v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2501.12023v1","updated":"2025-01-21T10:33:19Z","published":"2025-01-21T10:33:19Z","title":"Comparative Analysis of Pre-trained Deep Learning Models and DINOv2 for\n Cushing's Syndrome Diagnosis in Facial Analysis","summary":" Cushing's syndrome is a condition caused by excessive glucocorticoid\nsecretion from the adrenal cortex, often manifesting with moon facies and\nplethora, making facial data crucial for diagnosis. Previous studies have used\npre-trained convolutional neural networks (CNNs) for diagnosing Cushing's\nsyndrome using frontal facial images. 
However, CNNs are better at capturing\nlocal features, while Cushing's syndrome often presents with global facial\nfeatures. Transformer-based models like ViT and SWIN, which utilize\nself-attention mechanisms, can better capture long-range dependencies and\nglobal features. Recently, DINOv2, a foundation model based on visual\nTransformers, has gained interest. This study compares the performance of\nvarious pre-trained models, including CNNs, Transformer-based models, and\nDINOv2, in diagnosing Cushing's syndrome. We also analyze gender bias and the\nimpact of freezing mechanisms on DINOv2. Our results show that\nTransformer-based models and DINOv2 outperformed CNNs, with ViT achieving the\nhighest F1 score of 85.74%. Both the pre-trained model and DINOv2 had higher\naccuracy for female samples. DINOv2 also showed improved performance when\nfreezing parameters. In conclusion, Transformer-based models and DINOv2 are\neffective for Cushing's syndrome classification.\n","authors":["Hongjun Liu","Changwei Song","Jiaqi Qiang","Jianqiang Li","Hui Pan","Lin Lu","Xiao Long","Qing Zhao","Jiuzuo Huang","Shi Chen"],"pdf_url":"https://arxiv.org/pdf/2501.12023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12022v1","updated":"2025-01-21T10:25:09Z","published":"2025-01-21T10:25:09Z","title":"Foreign object segmentation in chest x-rays through anatomy-guided shape\n insertion","summary":" In this paper, we tackle the challenge of instance segmentation for foreign\nobjects in chest radiographs, commonly seen in postoperative follow-ups with\nstents, pacemakers, or ingested objects in children. The diversity of foreign\nobjects complicates dense annotation, as shown in insufficient existing\ndatasets. To address this, we propose the simple generation of synthetic data\nthrough (1) insertion of arbitrary shapes (lines, polygons, ellipses) with\nvarying contrasts and opacities, and (2) cut-paste augmentations from a small\nset of semi-automatically extracted labels. These insertions are guided by\nanatomy labels to ensure realistic placements, such as stents appearing only in\nrelevant vessels. Our approach enables networks to segment complex structures\nwith minimal manually labeled data. Notably, it achieves performance comparable\nto fully supervised models while using 93\\% fewer manual annotations.\n","authors":["Constantin Seibold","Hamza Kalisch","Lukas Heine","Simon Reiß","Jens Kleesiek"],"pdf_url":"https://arxiv.org/pdf/2501.12022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12020v1","updated":"2025-01-21T10:21:19Z","published":"2025-01-21T10:21:19Z","title":"On the \"Illusion\" of Gender Bias in Face Recognition: Explaining the\n Fairness Issue Through Non-demographic Attributes","summary":" Face recognition systems (FRS) exhibit significant accuracy differences based\non the user's gender. Since such a gender gap reduces the trustworthiness of\nFRS, more recent efforts have tried to find the causes. However, these studies\nmake use of manually selected, correlated, and small-sized sets of facial\nfeatures to support their claims. In this work, we analyse gender bias in face\nrecognition by successfully extending the search domain to decorrelated\ncombinations of 40 non-demographic facial characteristics. First, we propose a\ntoolchain to effectively decorrelate and aggregate facial attributes to enable\na less-biased gender analysis on large-scale data. Second, we introduce two new\nfairness metrics to measure fairness with and without context. 
Based on these\ngrounds, we thirdly present a novel unsupervised algorithm able to reliably\nidentify attribute combinations that lead to vanishing bias when used as filter\npredicates for balanced testing datasets. The experiments show that the gender\ngap vanishes when images of male and female subjects share specific attributes,\nclearly indicating that the issue is not a question of biology but of the\nsocial definition of appearance. These findings could reshape our understanding\nof fairness in face biometrics and provide insights into FRS, helping to\naddress gender bias issues.\n","authors":["Paul Jonas Kurz","Haiyu Wu","Kevin W. Bowyer","Philipp Terhörst"],"pdf_url":"https://arxiv.org/pdf/2501.12020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12016v1","updated":"2025-01-21T10:16:00Z","published":"2025-01-21T10:16:00Z","title":"Are Traditional Deep Learning Model Approaches as Effective as a\n Retinal-Specific Foundation Model for Ocular and Systemic Disease Detection?","summary":" Background: RETFound, a self-supervised, retina-specific foundation model\n(FM), showed potential in downstream applications. However, its comparative\nperformance with traditional deep learning (DL) models remains incompletely\nunderstood. This study aimed to evaluate RETFound against three\nImageNet-pretrained supervised DL models (ResNet50, ViT-base, SwinV2) in\ndetecting ocular and systemic diseases.\n Methods: We fine-tuned/trained RETFound and three DL models on full datasets,\n50%, 20%, and fixed sample sizes (400, 200, 100 images, with half comprising\ndisease cases; for each DR severity class, 100 and 50 cases were used.\nFine-tuned models were tested internally using the SEED (53,090 images) and\nAPTOS-2019 (3,672 images) datasets and externally validated on population-based\n(BES, CIEMS, SP2, UKBB) and open-source datasets (ODIR-5k, PAPILA, GAMMA,\nIDRiD, MESSIDOR-2). Model performance was compared using area under the\nreceiver operating characteristic curve (AUC) and Z-tests with Bonferroni\ncorrection (P<0.05/3).\n Interpretation: Traditional DL models are mostly comparable to RETFound for\nocular disease detection with large datasets. However, RETFound is superior in\nsystemic disease detection with smaller datasets. These findings offer valuable\ninsights into the respective merits and limitation of traditional models and\nFMs.\n","authors":["Samantha Min Er Yew","Xiaofeng Lei","Jocelyn Hui Lin Goh","Yibing Chen","Sahana Srinivasan","Miao-li Chee","Krithi Pushpanathan","Ke Zou","Qingshan Hou","Zhi Da Soh","Cancan Xue","Marco Chak Yan Yu","Charumathi Sabanayagam","E Shyong Tai","Xueling Sim","Yaxing Wang","Jost B. Jonas","Vinay Nangia","Gabriel Dawei Yang","Emma Anran Ran","Carol Yim-Lui Cheung","Yangqin Feng","Jun Zhou","Rick Siow Mong Goh","Yukun Zhou","Pearse A. Keane","Yong Liu","Ching-Yu Cheng","Yih-Chung Tham"],"pdf_url":"https://arxiv.org/pdf/2501.12016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05406v4","updated":"2025-01-21T10:07:10Z","published":"2023-09-11T12:12:52Z","title":"Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI\n Generation and Diffuse Glioma Growth Prediction","summary":" Diffuse gliomas are malignant brain tumors that grow widespread through the\nbrain. The complex interactions between neoplastic cells and normal tissue, as\nwell as the treatment-induced changes often encountered, make glioma tumor\ngrowth modeling challenging. 
In this paper, we present a novel end-to-end\nnetwork capable of future predictions of tumor masks and multi-parametric\nmagnetic resonance images (MRI) of how the tumor will look at any future time\npoints for different treatment plans. Our approach is based on cutting-edge\ndiffusion probabilistic models and deep-segmentation neural networks. We\nincluded sequential multi-parametric MRI and treatment information as\nconditioning inputs to guide the generative diffusion process as well as a\njoint segmentation process. This allows for tumor growth estimates and\nrealistic MRI generation at any given treatment and time point. We trained the\nmodel using real-world postoperative longitudinal MRI data with glioma tumor\ngrowth trajectories represented as tumor segmentation maps over time. The model\ndemonstrates promising performance across various tasks, including generating\nhigh-quality multi-parametric MRI with tumor masks, performing time-series\ntumor segmentations, and providing uncertainty estimates. Combined with the\ntreatment-aware generated MRI, the tumor growth predictions with uncertainty\nestimates can provide useful information for clinical decision-making.\n","authors":["Qinghui Liu","Elies Fuster-Garcia","Ivar Thokle Hovden","Bradley J MacIntosh","Edvard Grødem","Petter Brandal","Carles Lopez-Mateu","Donatas Sederevicius","Karoline Skogen","Till Schellhorn","Atle Bjørnerud","Kyrre Eeg Emblem"],"pdf_url":"https://arxiv.org/pdf/2309.05406v4.pdf","comment":"preprints in the IEEE-TMI"},{"id":"http://arxiv.org/abs/2501.11992v1","updated":"2025-01-21T09:23:22Z","published":"2025-01-21T09:23:22Z","title":"Survey on Hand Gesture Recognition from Visual Input","summary":" Hand gesture recognition has become an important research area, driven by the\ngrowing demand for human-computer interaction in fields such as sign language\nrecognition, virtual and augmented reality, and robotics. Despite the rapid\ngrowth of the field, there are few surveys that comprehensively cover recent\nresearch developments, available solutions, and benchmark datasets. This survey\naddresses this gap by examining the latest advancements in hand gesture and 3D\nhand pose recognition from various types of camera input data including RGB\nimages, depth images, and videos from monocular or multiview cameras, examining\nthe differing methodological requirements of each approach. Furthermore, an\noverview of widely used datasets is provided, detailing their main\ncharacteristics and application domains. Finally, open challenges such as\nachieving robust recognition in real-world environments, handling occlusions,\nensuring generalization across diverse users, and addressing computational\nefficiency for real-time applications are highlighted to guide future research\ndirections. By synthesizing the objectives, methodologies, and applications of\nrecent studies, this survey offers valuable insights into current trends,\nchallenges, and opportunities for future research in human hand gesture\nrecognition.\n","authors":["Manousos Linardakis","Iraklis Varlamis","Georgios Th. Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2501.11992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09411v2","updated":"2025-01-21T08:59:54Z","published":"2025-01-16T09:38:22Z","title":"Towards Robust and Realistic Human Pose Estimation via WiFi Signals","summary":" Robust WiFi-based human pose estimation is a challenging task that bridges\ndiscrete and subtle WiFi signals to human skeletons. 
This paper revisits this\nproblem and reveals two critical yet overlooked issues: 1) cross-domain gap,\ni.e., due to significant variations between source-target domain pose\ndistributions; and 2) structural fidelity gap, i.e., predicted skeletal poses\nmanifest distorted topology, usually with misplaced joints and disproportionate\nbone lengths. This paper fills these gaps by reformulating the task into a\nnovel two-phase framework dubbed DT-Pose: Domain-consistent representation\nlearning and Topology-constrained Pose decoding. Concretely, we first propose a\ntemporal-consistent contrastive learning strategy with uniformity\nregularization, coupled with self-supervised masking-reconstruction operations,\nto enable robust learning of domain-consistent and motion-discriminative\nWiFi-specific representations. Beyond this, we introduce a simple yet effective\npose decoder with task prompts, which integrates Graph Convolution Network\n(GCN) and Transformer layers to constrain the topology structure of the\ngenerated skeleton by exploring the adjacent-overarching relationships among\nhuman joints. Extensive experiments conducted on various benchmark datasets\nhighlight the superior performance of our method in tackling these fundamental\nchallenges in both 2D/3D human pose estimation tasks.\n","authors":["Yang Chen","Jingcai Guo","Song Guo","Jingren Zhou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2501.09411v2.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.11971v1","updated":"2025-01-21T08:33:32Z","published":"2025-01-21T08:33:32Z","title":"SMamba: Sparse Mamba for Event-based Object Detection","summary":" Transformer-based methods have achieved remarkable performance in event-based\nobject detection, owing to the global modeling ability. However, they neglect\nthe influence of non-event and noisy regions and process them uniformly,\nleading to high computational overhead. To mitigate computation cost, some\nresearchers propose window attention based sparsification strategies to discard\nunimportant regions, which sacrifices the global modeling ability and results\nin suboptimal performance. To achieve better trade-off between accuracy and\nefficiency, we propose Sparse Mamba (SMamba), which performs adaptive\nsparsification to reduce computational effort while maintaining global modeling\ncapability. Specifically, a Spatio-Temporal Continuity Assessment module is\nproposed to measure the information content of tokens and discard uninformative\nones by leveraging the spatiotemporal distribution differences between activity\nand noise events. Based on the assessment results, an Information-Prioritized\nLocal Scan strategy is designed to shorten the scan distance between\nhigh-information tokens, facilitating interactions among them in the spatial\ndimension. Furthermore, to extend the global interaction from 2D space to 3D\nrepresentations, a Global Channel Interaction module is proposed to aggregate\nchannel information from a global spatial perspective. 
Results on three\ndatasets (Gen1, 1Mpx, and eTram) demonstrate that our model outperforms other\nmethods in both performance and efficiency.\n","authors":["Nan Yang","Yang Wang","Zhanwen Liu","Meng Li","Yisheng An","Xiangmo Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.11971v1.pdf","comment":"AAAI2025"},{"id":"http://arxiv.org/abs/2501.06714v2","updated":"2025-01-21T08:33:26Z","published":"2025-01-12T04:44:44Z","title":"F3D-Gaus: Feed-forward 3D-aware Generation on ImageNet with\n Cycle-Consistent Gaussian Splatting","summary":" This paper tackles the problem of generalizable 3D-aware generation from\nmonocular datasets, e.g., ImageNet. The key challenge of this task is learning\na robust 3D-aware representation without multi-view or dynamic data, while\nensuring consistent texture and geometry across different viewpoints. Although\nsome baseline methods are capable of 3D-aware generation, the quality of the\ngenerated images still lags behind state-of-the-art 2D generation approaches,\nwhich excel in producing high-quality, detailed images. To address this severe\nlimitation, we propose a novel feed-forward pipeline based on pixel-aligned\nGaussian Splatting, coined as F3D-Gaus, which can produce more realistic and\nreliable 3D renderings from monocular inputs. In addition, we introduce a\nself-supervised cycle-consistent constraint to enforce cross-view consistency\nin the learned 3D representation. This training strategy naturally allows\naggregation of multiple aligned Gaussian primitives and significantly\nalleviates the interpolation limitations inherent in single-view pixel-aligned\nGaussian Splatting. Furthermore, we incorporate video model priors to perform\ngeometry-aware refinement, enhancing the generation of fine details in\nwide-viewpoint scenarios and improving the model's capability to capture\nintricate 3D textures. Extensive experiments demonstrate that our approach not\nonly achieves high-quality, multi-view consistent 3D-aware generation from\nmonocular datasets, but also significantly improves training and inference\nefficiency.\n","authors":["Yuxin Wang","Qianyi Wu","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2501.06714v2.pdf","comment":"Project Page: https://w-ted.github.io/publications/F3D-Gaus"},{"id":"http://arxiv.org/abs/2501.03659v4","updated":"2025-01-21T08:09:03Z","published":"2025-01-07T09:47:46Z","title":"DehazeGS: Seeing Through Fog with 3D Gaussian Splatting","summary":" Current novel view synthesis tasks primarily rely on high-quality and clear\nimages. However, in foggy scenes, scattering and attenuation can significantly\ndegrade the reconstruction and rendering quality. Although NeRF-based dehazing\nreconstruction algorithms have been developed, their use of deep fully\nconnected neural networks and per-ray sampling strategies leads to high\ncomputational costs. Moreover, NeRF's implicit representation struggles to\nrecover fine details from hazy scenes. In contrast, recent advancements in 3D\nGaussian Splatting achieve high-quality 3D scene reconstruction by explicitly\nmodeling point clouds into 3D Gaussians. In this paper, we propose leveraging\nthe explicit Gaussian representation to explain the foggy image formation\nprocess through a physically accurate forward rendering process. We introduce\nDehazeGS, a method capable of decomposing and rendering a fog-free background\nfrom participating media using only multi-view foggy images as input. We model\nthe transmission within each Gaussian distribution to simulate the formation of\nfog. 
During this process, we jointly learn the atmospheric light and scattering\ncoefficient while optimizing the Gaussian representation of the hazy scene. In\nthe inference stage, we eliminate the effects of scattering and attenuation on\nthe Gaussians and directly project them onto a 2D plane to obtain a clear view.\nExperiments on both synthetic and real-world foggy datasets demonstrate that\nDehazeGS achieves state-of-the-art performance in terms of both rendering\nquality and computational efficiency. Visualizations are available at\nhttps://dehazegs.github.io/\n","authors":["Jinze Yu","Yiqun Wang","Zhengda Lu","Jianwei Guo","Yong Li","Hongxing Qin","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03659v4.pdf","comment":"9 pages,4 figures. visualizations are available at\n https://dehazegs.github.io/"},{"id":"http://arxiv.org/abs/2501.11927v1","updated":"2025-01-21T07:03:11Z","published":"2025-01-21T07:03:11Z","title":"A Lightweight and Interpretable Deepfakes Detection Framework","summary":" The recent realistic creation and dissemination of so-called deepfakes poses\na serious threat to social life, civil rest, and law. Celebrity defamation,\nelection manipulation, and deepfakes as evidence in a court of law are a few\npotential consequences of deepfakes. The availability of open source trained\nmodels based on modern frameworks such as PyTorch or TensorFlow, video\nmanipulation apps such as FaceApp and REFACE, and economical computing\ninfrastructure has eased the creation of deepfakes. Most of the existing\ndetectors focus on detecting either face-swap, lip-sync, or puppet master\ndeepfakes, but a unified framework to detect all three types of deepfakes is\nhardly explored. This paper presents a unified framework that exploits the\npower of proposed feature fusion of hybrid facial landmarks and our novel heart\nrate features for detection of all types of deepfakes. We propose novel heart\nrate features and fuse them with the facial landmark features to better\nextract the facial artifacts of fake videos and natural variations available in\nthe original videos. We used these features to train a light-weight XGBoost to\nclassify between the deepfake and bonafide videos. We evaluated the performance\nof our framework on the world leaders dataset (WLDR) that contains all types of\ndeepfakes. Experimental results illustrate that the proposed framework offers\nsuperior detection performance over the comparative deepfakes detection\nmethods. Performance comparison of our framework against the LSTM-FCN, a\ncandidate deep learning model, shows that the proposed model achieves similar\nresults; however, it is more interpretable.\n","authors":["Muhammad Umar Farooq","Ali Javed","Khalid Mahmood Malik","Muhammad Anas Raza"],"pdf_url":"https://arxiv.org/pdf/2501.11927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11923v1","updated":"2025-01-21T06:55:31Z","published":"2025-01-21T06:55:31Z","title":"Progressive Cross Attention Network for Flood Segmentation using\n Multispectral Satellite Imagery","summary":" In recent years, the integration of deep learning techniques with remote\nsensing technology has revolutionized the way natural hazards, such as floods,\nare monitored and managed. However, existing methods for flood segmentation\nusing remote sensing data often overlook the utility of correlative features\namong multispectral satellite information. 
In this study, we introduce a\nprogressive cross attention network (ProCANet), a deep learning model that\nprogressively applies both self- and cross-attention mechanisms to\nmultispectral features, generating optimal feature combinations for flood\nsegmentation. The proposed model was compared with state-of-the-art approaches\nusing the Sen1Floods11 dataset and our bespoke flood data generated for the Citarum\nRiver basin, Indonesia. Our model demonstrated superior performance with the\nhighest Intersection over Union (IoU) score of 0.815. Our results in this\nstudy, coupled with the ablation assessment comparing scenarios with and\nwithout attention across various modalities, open a promising path for\nenhancing the accuracy of flood analysis using remote sensing technology.\n","authors":["Vicky Feliren","Fithrothul Khikmah","Irfan Dwiki Bhaswara","Bahrul I. Nasution","Alex M. Lechner","Muhamad Risqi U. Saputra"],"pdf_url":"https://arxiv.org/pdf/2501.11923v1.pdf","comment":"5 pages, 4 figures, published in IEEE Geoscience and Remote Sensing\n Letters"},{"id":"http://arxiv.org/abs/2312.02253v2","updated":"2025-01-21T06:03:07Z","published":"2023-12-04T18:35:27Z","title":"Diversify, Don't Fine-Tune: Scaling Up Visual Recognition Training with\n Synthetic Images","summary":" Recent advances in generative deep learning have enabled the creation of\nhigh-quality synthetic images in text-to-image generation. Prior work shows\nthat fine-tuning a pretrained diffusion model on ImageNet and generating\nsynthetic training images from the finetuned model can enhance an ImageNet\nclassifier's performance. However, performance degrades as synthetic images\noutnumber real ones. In this paper, we explore whether generative fine-tuning\nis essential for this improvement and whether it is possible to further scale\nup training using more synthetic data. We present a new framework leveraging\noff-the-shelf generative models to generate synthetic training images,\naddressing multiple challenges: class name ambiguity, lack of diversity in\nnaive prompts, and domain shifts. Specifically, we leverage large language\nmodels (LLMs) and CLIP to resolve class name ambiguity. To diversify images, we\npropose contextualized diversification (CD) and stylized diversification (SD)\nmethods, also prompted by LLMs. Finally, to mitigate domain shifts, we leverage\ndomain adaptation techniques with auxiliary batch normalization for synthetic\nimages. Our framework consistently enhances recognition model performance with\nmore synthetic data, up to 6x the original ImageNet size, showcasing the\npotential of synthetic data for improved recognition models and strong\nout-of-domain generalization.\n","authors":["Zhuoran Yu","Chenchen Zhu","Sean Culatana","Raghuraman Krishnamoorthi","Fanyi Xiao","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2312.02253v2.pdf","comment":"Accepted by Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2410.05413v3","updated":"2025-01-21T05:43:51Z","published":"2024-10-07T18:26:30Z","title":"Implicitly Learned Neural Phase Functions for Basis-Free Point Spread\n Function Engineering","summary":" Point spread function (PSF) engineering is vital for precisely controlling\nthe focus of light in computational imaging, with applications in neural\nimaging, fluorescence microscopy, and biophotonics. 
The PSF is derived from the\nmagnitude of the Fourier transform of a phase function, making the construction\nof the phase function given the PSF (PSF engineering) an ill-posed inverse\nproblem. Traditional PSF engineering methods rely on physical basis functions,\nlimiting their ability to generalize across the range of PSFs required for\nimaging tasks. We introduce a novel approach leveraging implicit neural\nrepresentations that overcome the limitations of pixel-wise optimization\nmethods. Our approach achieves a median MSSIM of 0.8162 and a mean MSSIM of\n0.5634, compared to a median MSSIM of 0.0 and a mean MSSIM of 0.1841 with\npixel-wise optimization when learning randomly generated phase functions. Our\napproach also achieves a median PSNR of 10.38 dB and a mean PSNR of 8.672 dB,\ncompared to a median PSNR of 6.653 dB and a mean PSNR of 6.660 dB with\npixel-wise optimization for this task.\n","authors":["Aleksey Valouev"],"pdf_url":"https://arxiv.org/pdf/2410.05413v3.pdf","comment":"3 pages, 7 figures. To be published in ICVISP 2024\n (https://www.icvisp.org/)"},{"id":"http://arxiv.org/abs/2501.11901v1","updated":"2025-01-21T05:41:09Z","published":"2025-01-21T05:41:09Z","title":"Enhancing Adversarial Transferability via Component-Wise Augmentation\n Method","summary":" Deep Neural Networks (DNNs) are highly vulnerable to adversarial examples,\nwhich pose significant challenges in security-sensitive applications. Among\nvarious adversarial attack strategies, input transformation-based attacks have\ndemonstrated remarkable effectiveness in enhancing adversarial transferability.\nHowever, existing methods fail to diversify attention regions across models\nadequately and introduce excessive information loss during transformations. In\nthis paper, we introduce a novel input transformation-based method, termed\nComponent-Wise Augmentation (CWA), designed to enhance transferability by\nlocally applying block-wise transformations. CWA strategically integrates\ninterpolation and selective rotation on individual image blocks to diversify\nmodel attention regions while preserving semantic integrity. Extensive\nexperiments on the standard ImageNet dataset show that CWA consistently\noutperforms state-of-the-art methods in both attack success rates and stability\nacross CNN- and Transformer-based models, while also demonstrating superior\nperformance against multiple defense methods.\n","authors":["Hangyu Liu","Bo Peng","Pengxiang Ding","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.11901v1.pdf","comment":"13pages,5 figures"},{"id":"http://arxiv.org/abs/2407.21004v3","updated":"2025-01-21T05:31:45Z","published":"2024-07-30T17:51:44Z","title":"Evolver: Chain-of-Evolution Prompting to Boost Large Multimodal Models\n for Hateful Meme Detection","summary":" Recent advances show that two-stream approaches have achieved outstanding\nperformance in hateful meme detection. However, hateful memes constantly evolve\nas new memes emerge by fusing progressive cultural ideas, making existing\nmethods obsolete or ineffective. In this work, we explore the potential of\nLarge Multimodal Models (LMMs) for hateful meme detection. To this end, we\npropose Evolver, which incorporates LMMs via Chain-of-Evolution (CoE)\nPrompting, by integrating the evolution attribute and in-context information of\nmemes. Specifically, Evolver simulates the evolving and expressing process of\nmemes and reasons through LMMs in a step-by-step manner. 
First, an evolutionary\npair mining module retrieves the top-k most similar memes in the external\ncurated meme set with the input meme. Second, an evolutionary information\nextractor is designed to summarize the semantic regularities between the paired\nmemes for prompting. Finally, a contextual relevance amplifier enhances the\nin-context hatefulness information to boost the search for evolutionary\nprocesses. Extensive experiments on public FHM, MAMI, and HarM datasets show\nthat CoE prompting can be incorporated into existing LMMs to improve their\nperformance. More encouragingly, it can serve as an interpretive tool to\npromote the understanding of the evolution of social memes. [Homepage]\n(https://github.com/inFaaa/Evolver)\n","authors":["Jinfa Huang","Jinsheng Pan","Zhongwei Wan","Hanjia Lyu","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2407.21004v3.pdf","comment":"accepted by COLING 2025"},{"id":"http://arxiv.org/abs/2501.11899v1","updated":"2025-01-21T05:29:34Z","published":"2025-01-21T05:29:34Z","title":"LASER: Lip Landmark Assisted Speaker Detection for Robustness","summary":" Active Speaker Detection (ASD) aims to identify speaking individuals in\ncomplex visual scenes. While humans can easily detect speech by matching lip\nmovements to audio, current ASD models struggle to establish this\ncorrespondence, often misclassifying non-speaking instances when audio and lip\nmovements are unsynchronized. To address this limitation, we propose Lip\nlandmark Assisted Speaker dEtection for Robustness (LASER). Unlike models that\nrely solely on facial frames, LASER explicitly focuses on lip movements by\nintegrating lip landmarks in training. Specifically, given a face track, LASER\nextracts frame-level visual features and the 2D coordinates of lip landmarks\nusing a lightweight detector. These coordinates are encoded into dense feature\nmaps, providing spatial and structural information on lip positions.\nRecognizing that landmark detectors may sometimes fail under challenging\nconditions (e.g., low resolution, occlusions, extreme angles), we incorporate\nan auxiliary consistency loss to align predictions from both lip-aware and\nface-only features, ensuring reliable performance even when lip data is absent.\nExtensive experiments across multiple datasets show that LASER outperforms\nstate-of-the-art models, especially in scenarios with desynchronized audio and\nvisuals, demonstrating robust performance in real-world video contexts. Code is\navailable at \\url{https://github.com/plnguyen2908/LASER_ASD}.\n","authors":["Le Thien Phuc Nguyen","Zhuoran Yu","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2501.11899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11895v1","updated":"2025-01-21T05:15:10Z","published":"2025-01-21T05:15:10Z","title":"Contrastive Masked Autoencoders for Character-Level Open-Set Writer\n Identification","summary":" In the realm of digital forensics and document authentication, writer\nidentification plays a crucial role in determining the authors of documents\nbased on handwriting styles. The primary challenge in writer-id is the\n\"open-set scenario\", where the goal is accurately recognizing writers unseen\nduring the model training. To overcome this challenge, representation learning\nis the key. This method can capture unique handwriting features, enabling it to\nrecognize styles not previously encountered during training. 
Building on this\nconcept, this paper introduces the Contrastive Masked Auto-Encoders (CMAE) for\nCharacter-level Open-Set Writer Identification. We merge Masked Auto-Encoders\n(MAE) with Contrastive Learning (CL) to simultaneously and respectively capture\nsequential information and distinguish diverse handwriting styles.\nDemonstrating its effectiveness, our model achieves state-of-the-art (SOTA)\nresults on the CASIA online handwriting dataset, reaching an impressive\nprecision rate of 89.7%. Our study advances universal writer-id with a\nsophisticated representation learning approach, contributing substantially to\nthe ever-evolving landscape of digital handwriting analysis, and catering to\nthe demands of an increasingly interconnected world.\n","authors":["Xiaowei Jiang","Wenhao Ma","Yiqun Duan","Thomas Do","Chin-Teng Lin"],"pdf_url":"https://arxiv.org/pdf/2501.11895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11884v1","updated":"2025-01-21T04:35:27Z","published":"2025-01-21T04:35:27Z","title":"Fast Underwater Scene Reconstruction using Multi-View Stereo and\n Physical Imaging","summary":" Underwater scene reconstruction poses a substantial challenge because of the\nintricate interplay between light and the medium, resulting in scattering and\nabsorption effects that make both depth estimation and rendering more complex.\nWhile recent Neural Radiance Fields (NeRF) based methods for underwater scenes\nachieve high-quality results by modeling and separating the scattering medium,\nthey still suffer from slow training and rendering speeds. To address these\nlimitations, we propose a novel method that integrates Multi-View Stereo (MVS)\nwith a physics-based underwater image formation model. Our approach consists of\ntwo branches: one for depth estimation using the traditional cost volume\npipeline of MVS, and the other for rendering based on the physics-based image\nformation model. The depth branch improves scene geometry, while the medium\nbranch determines the scattering parameters to achieve precise scene rendering.\nUnlike traditional MVSNet methods that rely on ground-truth depth, our method\ndoes not necessitate the use of depth truth, thus allowing for expedited\ntraining and rendering processes. By leveraging the medium subnet to estimate\nthe medium parameters and combining this with a color MLP for rendering, we\nrestore the true colors of underwater scenes and achieve higher-fidelity\ngeometric representations. Experimental results show that our method enables\nhigh-quality synthesis of novel views in scattering media, clear views\nrestoration by removing the medium, and outperforms existing methods in\nrendering quality and training efficiency.\n","authors":["Shuyi Hu","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2501.11884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11876v1","updated":"2025-01-21T04:11:04Z","published":"2025-01-21T04:11:04Z","title":"FNIN: A Fourier Neural Operator-based Numerical Integration Network for\n Surface-form-gradients","summary":" Surface-from-gradients (SfG) aims to recover a three-dimensional (3D) surface\nfrom its gradients. Traditional methods encounter significant challenges in\nachieving high accuracy and handling high-resolution inputs, particularly\nfacing the complex nature of discontinuities and the inefficiencies associated\nwith large-scale linear solvers. 
Although recent advances in deep learning,\nsuch as photometric stereo, have enhanced normal estimation accuracy, they do\nnot fully address the intricacies of gradient-based surface reconstruction. To\novercome these limitations, we propose a Fourier neural operator-based\nNumerical Integration Network (FNIN) within a two-stage optimization framework.\nIn the first stage, our approach employs an iterative architecture for\nnumerical integration, harnessing an advanced Fourier neural operator to\napproximate the solution operator in Fourier space. Additionally, a\nself-learning attention mechanism is incorporated to effectively detect and\nhandle discontinuities. In the second stage, we refine the surface\nreconstruction by formulating a weighted least squares problem, addressing the\nidentified discontinuities rationally. Extensive experiments demonstrate that\nour method achieves significant improvements in both accuracy and efficiency\ncompared to current state-of-the-art solvers. This is particularly evident in\nhandling high-resolution images with complex data, achieving errors of fewer\nthan 0.1 mm on tested objects.\n","authors":["Jiaqi Leng","Yakun Ju","Yuanxu Duan","Jiangnan Zhang","Qingxuan Lv","Zuxuan Wu","Hao Fan"],"pdf_url":"https://arxiv.org/pdf/2501.11876v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2408.11051v2","updated":"2025-01-21T04:06:09Z","published":"2024-08-20T17:57:46Z","title":"FLAME: Learning to Navigate with Multimodal LLM in Urban Environments","summary":" Large Language Models (LLMs) have demonstrated potential in\nVision-and-Language Navigation (VLN) tasks, yet current applications face\nchallenges. While LLMs excel in general conversation scenarios, they struggle\nwith specialized navigation tasks, yielding suboptimal performance compared to\nspecialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied\nAgent), a novel Multimodal LLM-based agent and architecture designed for urban\nVLN tasks that efficiently handles multiple observations. Our approach\nimplements a three-phase tuning technique for effective adaptation to\nnavigation tasks, including single perception tuning for street view\ndescription, multiple perception tuning for route summarization, and end-to-end\ntraining on VLN datasets. The augmented datasets are synthesized automatically.\nExperimental results demonstrate FLAME's superiority over existing methods,\nsurpassing state-of-the-art methods by a 7.3% increase in task completion on\nTouchdown dataset. This work showcases the potential of Multimodal LLMs (MLLMs)\nin complex navigation tasks, representing an advancement towards applications\nof MLLMs in the field of embodied intelligence.\n","authors":["Yunzhe Xu","Yiyuan Pan","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11051v2.pdf","comment":"Accepted to AAAI 2025 (Oral)"},{"id":"http://arxiv.org/abs/2412.10718v4","updated":"2025-01-21T04:00:36Z","published":"2024-12-14T07:22:03Z","title":"Grid: Omni Visual Generation","summary":" Visual generation has witnessed remarkable progress in single-image tasks,\nyet extending these capabilities to temporal sequences remains challenging.\nCurrent approaches either build specialized video models from scratch with\nenormous computational costs or add separate motion modules to image\ngenerators, both requiring learning temporal dynamics anew. We observe that\nmodern image generation models possess underutilized potential in handling\nstructured layouts with implicit temporal understanding. 
Building on this\ninsight, we introduce GRID, which reformulates temporal sequences as grid\nlayouts, enabling holistic processing of visual sequences while leveraging\nexisting model capabilities. Through a parallel flow-matching training strategy\nwith coarse-to-fine scheduling, our approach achieves up to 67 faster inference\nspeeds while using <1/1000 of the computational resources compared to\nspecialized models. Extensive experiments demonstrate that GRID not only excels\nin temporal tasks from Text-to-Video to 3D Editing but also preserves strong\nperformance in image generation, establishing itself as an efficient and\nversatile omni-solution for visual generation.\n","authors":["Cong Wan","Xiangyang Luo","Hao Luo","Zijian Cai","Yiren Song","Yunlong Zhao","Yifan Bai","Yuhang He","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2412.10718v4.pdf","comment":"Codes: https://github.com/Should-AI-Lab/GRID"},{"id":"http://arxiv.org/abs/2405.16960v2","updated":"2025-01-21T03:49:48Z","published":"2024-05-27T08:55:17Z","title":"DCPI-Depth: Explicitly Infusing Dense Correspondence Prior to\n Unsupervised Monocular Depth Estimation","summary":" There has been a recent surge of interest in learning to perceive depth from\nmonocular videos in an unsupervised fashion. A key challenge in this field is\nachieving robust and accurate depth estimation in challenging scenarios,\nparticularly in regions with weak textures or where dynamic objects are\npresent. This study makes three major contributions by delving deeply into\ndense correspondence priors to provide existing frameworks with explicit\ngeometric constraints. The first novelty is a contextual-geometric depth\nconsistency loss, which employs depth maps triangulated from dense\ncorrespondences based on estimated ego-motion to guide the learning of depth\nperception from contextual information, since explicitly triangulated depth\nmaps capture accurate relative distances among pixels. The second novelty\narises from the observation that there exists an explicit, deducible\nrelationship between optical flow divergence and depth gradient. A differential\nproperty correlation loss is, therefore, designed to refine depth estimation\nwith a specific emphasis on local variations. The third novelty is a\nbidirectional stream co-adjustment strategy that enhances the interaction\nbetween rigid and optical flows, encouraging the former towards more accurate\ncorrespondence and making the latter more adaptable across various scenarios\nunder the static scene hypotheses. DCPI-Depth, a framework that incorporates\nall these innovative components and couples two bidirectional and collaborative\nstreams, achieves state-of-the-art performance and generalizability across\nmultiple public datasets, outperforming all existing prior arts. Specifically,\nit demonstrates accurate depth estimation in texture-less and dynamic regions,\nand shows more reasonable smoothness. 
Our source code will be publicly\navailable at mias.group/DCPI-Depth upon publication.\n","authors":["Mengtan Zhang","Yi Feng","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2405.16960v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.09658v2","updated":"2025-01-21T03:43:41Z","published":"2024-12-12T08:23:07Z","title":"SEGT: A General Spatial Expansion Group Transformer for nuScenes\n Lidar-based Object Detection Task","summary":" In the technical report, we present a novel transformer-based framework for\nnuScenes lidar-based object detection task, termed Spatial Expansion Group\nTransformer (SEGT). To efficiently handle the irregular and sparse nature of\npoint cloud, we propose migrating the voxels into distinct specialized ordered\nfields with the general spatial expansion strategies, and employ group\nattention mechanisms to extract the exclusive feature maps within each field.\nSubsequently, we integrate the feature representations across different ordered\nfields by alternately applying diverse expansion strategies, thereby enhancing\nthe model's ability to capture comprehensive spatial information. The method\nwas evaluated on the nuScenes lidar-based object detection test dataset,\nachieving an NDS score of 73.9 without Test-Time Augmentation (TTA) and 74.5\nwith TTA, demonstrating the effectiveness of the proposed method. Notably, our\nmethod ranks the 1st place in the nuScenes lidar-based object detection task.\n","authors":["Cheng Mei","Hao He","Yahui Liu","Zhenhua Guo"],"pdf_url":"https://arxiv.org/pdf/2412.09658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01097v2","updated":"2025-01-21T03:32:22Z","published":"2025-01-02T06:46:13Z","title":"EliGen: Entity-Level Controlled Image Generation with Regional Attention","summary":" Recent advancements in diffusion models have significantly advanced\ntext-to-image generation, yet global text prompts alone remain insufficient for\nachieving fine-grained control over individual entities within an image. To\naddress this limitation, we present EliGen, a novel framework for Entity-Level\ncontrolled Image Generation. We introduce regional attention, a mechanism for\ndiffusion transformers that requires no additional parameters, seamlessly\nintegrating entity prompts and arbitrary-shaped spatial masks. By contributing\na high-quality dataset with fine-grained spatial and semantic entity-level\nannotations, we train EliGen to achieve robust and accurate entity-level\nmanipulation, surpassing existing methods in both spatial precision and image\nquality. Additionally, we propose an inpainting fusion pipeline, extending\nEliGen's capabilities to multi-entity image inpainting tasks. We further\ndemonstrate its flexibility by integrating it with other open-source models\nsuch as IP-Adapter, In-Context LoRA and MLLM, unlocking new creative\npossibilities. The source code, model, and dataset are published at\nhttps://github.com/modelscope/DiffSynth-Studio.\n","authors":["Hong Zhang","Zhongjie Duan","Xingjun Wang","Yingda Chen","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.01097v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11858v1","updated":"2025-01-21T03:22:10Z","published":"2025-01-21T03:22:10Z","title":"EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents","summary":" Multimodal Large Language Models (MLLMs) have shown significant advancements,\nproviding a promising future for embodied agents. 
Existing benchmarks for\nevaluating MLLMs primarily utilize static images or videos, limiting\nassessments to non-interactive scenarios. Meanwhile, existing embodied AI\nbenchmarks are task-specific and not diverse enough, and thus do not adequately\nevaluate the embodied capabilities of MLLMs. To address this, we propose\nEmbodiedEval, a comprehensive and interactive evaluation benchmark for MLLMs\nwith embodied tasks. EmbodiedEval features 328 distinct tasks within 125 varied\n3D scenes, each of which is rigorously selected and annotated. It covers a\nbroad spectrum of existing embodied AI tasks with significantly enhanced\ndiversity, all within a unified simulation and evaluation framework tailored\nfor MLLMs. The tasks are organized into five categories: navigation, object\ninteraction, social interaction, attribute question answering, and spatial\nquestion answering to assess different capabilities of the agents. We evaluated\nthe state-of-the-art MLLMs on EmbodiedEval and found that they have a\nsignificant shortfall compared to human-level performance on embodied tasks. Our analysis\ndemonstrates the limitations of existing MLLMs in embodied capabilities,\nproviding insights for their future development. We open-source all evaluation\ndata and the simulation framework at https://github.com/thunlp/EmbodiedEval.\n","authors":["Zhili Cheng","Yuge Tu","Ran Li","Shiqi Dai","Jinyi Hu","Shengding Hu","Jiahao Li","Yang Shi","Tianyu Yu","Weize Chen","Lei Shi","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2501.11858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11854v1","updated":"2025-01-21T03:10:52Z","published":"2025-01-21T03:10:52Z","title":"WaveNet-SF: A Hybrid Network for Retinal Disease Detection Based on\n Wavelet Transform in the Spatial-Frequency Domain","summary":" Retinal diseases are a leading cause of vision impairment and blindness, with\ntimely diagnosis being critical for effective treatment. Optical Coherence\nTomography (OCT) has become a standard imaging modality for retinal disease\ndiagnosis, but OCT images often suffer from issues such as speckle noise,\ncomplex lesion shapes, and varying lesion sizes, making interpretation\nchallenging. In this paper, we propose a novel framework, WaveNet-SF, to\nenhance retinal disease detection by integrating spatial-domain and\nfrequency-domain learning. The framework utilizes wavelet transforms to\ndecompose OCT images into low- and high-frequency components, enabling the\nmodel to extract both global structural features and fine-grained details. To\nimprove lesion detection, we introduce a multi-scale wavelet spatial attention\n(MSW-SA) module, which enhances the model's focus on regions of interest at\nmultiple scales. Additionally, a high-frequency feature compensation block\n(HFFC) is incorporated to recover edge information lost during wavelet\ndecomposition, suppress noise, and preserve fine details crucial for lesion\ndetection. Our approach achieves state-of-the-art (SOTA) classification\naccuracies of 97.82% and 99.58% on the OCT-C8 and OCT2017 datasets,\nrespectively, surpassing existing methods. 
These results demonstrate the\nefficacy of WaveNet-SF in addressing the challenges of OCT image analysis and\nits potential as a powerful tool for retinal disease diagnosis.\n","authors":["Jilan Cheng","Guoli Long","Zeyu Zhang","Zhenjia Qi","Hanyu Wang","Libin Lu","Shuihua Wang","Yudong Zhang","Jin Hong"],"pdf_url":"https://arxiv.org/pdf/2501.11854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10734v3","updated":"2025-01-21T03:00:04Z","published":"2024-12-14T08:08:40Z","title":"OmniHD-Scenes: A Next-Generation Multimodal Dataset for Autonomous\n Driving","summary":" The rapid advancement of deep learning has intensified the need for\ncomprehensive data for use by autonomous driving algorithms. High-quality\ndatasets are crucial for the development of effective data-driven autonomous\ndriving solutions. Next-generation autonomous driving datasets must be\nmultimodal, incorporating data from advanced sensors that feature extensive\ndata coverage, detailed annotations, and diverse scene representation. To\naddress this need, we present OmniHD-Scenes, a large-scale multimodal dataset\nthat provides comprehensive omnidirectional high-definition data. The\nOmniHD-Scenes dataset combines data from 128-beam LiDAR, six cameras, and six\n4D imaging radar systems to achieve full environmental perception. The dataset\ncomprises 1501 clips, each approximately 30-s long, totaling more than 450K\nsynchronized frames and more than 5.85 million synchronized sensor data points.\nWe also propose a novel 4D annotation pipeline. To date, we have annotated 200\nclips with more than 514K precise 3D bounding boxes. These clips also include\nsemantic segmentation annotations for static scene elements. Additionally, we\nintroduce a novel automated pipeline for generation of the dense occupancy\nground truth, which effectively leverages information from non-key frames.\nAlongside the proposed dataset, we establish comprehensive evaluation metrics,\nbaseline models, and benchmarks for 3D detection and semantic occupancy\nprediction. These benchmarks utilize surround-view cameras and 4D imaging radar\nto explore cost-effective sensor solutions for autonomous driving applications.\nExtensive experiments demonstrate the effectiveness of our low-cost sensor\nconfiguration and its robustness under adverse conditions. Data will be\nreleased at https://www.2077ai.com/OmniHD-Scenes.\n","authors":["Lianqing Zheng","Long Yang","Qunshu Lin","Wenjin Ai","Minghao Liu","Shouyi Lu","Jianan Liu","Hongze Ren","Jingyue Mo","Xiaokai Bai","Jie Bai","Zhixiong Ma","Xichan Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.10734v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11841v1","updated":"2025-01-21T02:51:10Z","published":"2025-01-21T02:51:10Z","title":"Survey on Monocular Metric Depth Estimation","summary":" Monocular Depth Estimation (MDE) is a fundamental computer vision task\nunderpinning applications such as spatial understanding, 3D reconstruction, and\nautonomous driving. While deep learning-based MDE methods can predict relative\ndepth from a single image, their lack of metric scale information often results\nin scale inconsistencies, limiting their utility in downstream tasks like\nvisual SLAM, 3D reconstruction, and novel view synthesis. Monocular Metric\nDepth Estimation (MMDE) addresses these challenges by enabling precise,\nscene-scale depth inference. MMDE improves depth consistency, enhances\nsequential task stability, simplifies integration into downstream applications,\nand broadens practical use cases. 
This paper provides a comprehensive review of\ndepth estimation technologies, highlighting the evolution from geometry-based\nmethods to state-of-the-art deep learning approaches. It emphasizes\nadvancements in scale-agnostic methods, which are crucial for enabling\nzero-shot generalization as the foundational capability for MMDE. Recent\nprogress in zero-shot MMDE research is explored, focusing on challenges such as\nmodel generalization and the loss of detail at scene boundaries. Innovative\nstrategies to address these issues include unlabelled data augmentation, image\npatching, architectural optimization, and generative techniques. These\nadvancements, analyzed in detail, demonstrate significant contributions to\novercoming existing limitations. Finally, this paper synthesizes recent\ndevelopments in zero-shot MMDE, identifies unresolved challenges, and outlines\nfuture research directions. By offering a clear roadmap and cutting-edge\ninsights, this work aims to deepen understanding of MMDE, inspire novel\napplications, and drive technological innovation.\n","authors":["Jiuling Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.11841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11836v1","updated":"2025-01-21T02:44:05Z","published":"2025-01-21T02:44:05Z","title":"Data-driven Detection and Evaluation of Damages in Concrete Structures:\n Using Deep Learning and Computer Vision","summary":" Structural integrity is vital for maintaining the safety and longevity of\nconcrete infrastructures such as bridges, tunnels, and walls. Traditional\nmethods for detecting damages like cracks and spalls are labor-intensive,\ntime-consuming, and prone to human error. To address these challenges, this\nstudy explores advanced data-driven techniques using deep learning for\nautomated damage detection and analysis. Two state-of-the-art instance\nsegmentation models, YOLO-v7 instance segmentation and Mask R-CNN, were\nevaluated using a dataset comprising 400 images, augmented to 10,995 images\nthrough geometric and color-based transformations to enhance robustness. The\nmodels were trained and validated using a dataset split into 90% training set,\nvalidation and test set 10%. Performance metrics such as precision, recall,\nmean average precision (mAP@0.5), and frames per second (FPS) were used for\nevaluation. YOLO-v7 achieved a superior mAP@0.5 of 96.1% and processed 40 FPS,\noutperforming Mask R-CNN, which achieved a mAP@0.5 of 92.1% with a slower\nprocessing speed of 18 FPS. The findings recommend YOLO-v7 instance\nsegmentation model for real-time, high-speed structural health monitoring,\nwhile Mask R-CNN is better suited for detailed offline assessments. This study\ndemonstrates the potential of deep learning to revolutionize infrastructure\nmaintenance, offering a scalable and efficient solution for automated damage\ndetection.\n","authors":["Saeid Ataei","Saeed Adibnazari","Seyyed Taghi Ataei"],"pdf_url":"https://arxiv.org/pdf/2501.11836v1.pdf","comment":"17 pages, 10 figures. 
This study focuses on the data-driven detection\n and evaluation of damages in concrete structures using deep learning and\n computer vision techniques"},{"id":"http://arxiv.org/abs/2404.10292v2","updated":"2025-01-21T02:43:24Z","published":"2024-04-16T05:29:14Z","title":"From Data Deluge to Data Curation: A Filtering-WoRA Paradigm for\n Efficient Text-based Person Search","summary":" In text-based person search endeavors, data generation has emerged as a\nprevailing practice, addressing concerns over privacy preservation and the\narduous task of manual annotation. Although the number of synthesized data can\nbe infinite in theory, the scientific conundrum persists of how much\ngenerated data optimally fuels subsequent model training. We observe that only\na subset of the data in these constructed datasets plays a decisive role.\nTherefore, we introduce a new Filtering-WoRA paradigm, which contains a\nfiltering algorithm to identify this crucial data subset and a WoRA (Weighted\nLow-Rank Adaptation) learning strategy for light fine-tuning. The filtering\nalgorithm is based on cross-modality relevance to remove the large number of coarse\nmatching synthesis pairs. As the amount of data decreases, we do not need to\nfine-tune the entire model. Therefore, we propose a WoRA learning strategy to\nefficiently update a minimal portion of model parameters. WoRA streamlines the\nlearning process, enabling heightened efficiency in extracting knowledge from\nfewer, yet potent, data instances. Extensive experimentation validates the\nefficacy of pretraining, where our model achieves advanced and efficient\nretrieval performance on challenging real-world benchmarks. Notably, on the\nCUHK-PEDES dataset, we have achieved a competitive mAP of 67.02% while reducing\nmodel training time by 19.82%.\n","authors":["Jintao Sun","Hao Fei","Zhedong Zheng","Gangyi Ding"],"pdf_url":"https://arxiv.org/pdf/2404.10292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11210v3","updated":"2025-01-21T02:10:01Z","published":"2024-06-17T05:03:44Z","title":"Zero-Shot Scene Change Detection","summary":" We present a novel, training-free approach to scene change detection. Our\nmethod leverages tracking models, which inherently perform change detection\nbetween consecutive frames of video by identifying common objects and detecting\nnew or missing objects. Specifically, our method takes advantage of the change\ndetection effect of the tracking model by inputting reference and query images\ninstead of consecutive frames. Furthermore, we focus on the content gap and\nstyle gap between two input images in change detection, and address both issues\nby proposing adaptive content threshold and style bridging layers,\nrespectively. Finally, we extend our approach to video, leveraging rich\ntemporal information to enhance the performance of scene change detection. We\ncompare our approach with baselines through various experiments. While existing\ntraining-based baselines tend to specialize only in the trained domain, our method\nshows consistent performance across various domains, proving the\ncompetitiveness of our approach.\n","authors":["Kyusik Cho","Dong Yeop Kim","Euntai Kim"],"pdf_url":"https://arxiv.org/pdf/2406.11210v3.pdf","comment":"AAAI 2025. 
Code available at: https://github.com/kyusik-cho/ZSSCD"},{"id":"http://arxiv.org/abs/2501.11815v1","updated":"2025-01-21T01:45:56Z","published":"2025-01-21T01:45:56Z","title":"CogMorph: Cognitive Morphing Attacks for Text-to-Image Models","summary":" The development of text-to-image (T2I) generative models, that enable the\ncreation of high-quality synthetic images from textual prompts, has opened new\nfrontiers in creative design and content generation. However, this paper\nreveals a significant and previously unrecognized ethical risk inherent in this\ntechnology and introduces a novel method, termed the Cognitive Morphing Attack\n(CogMorph), which manipulates T2I models to generate images that retain the\noriginal core subjects but embeds toxic or harmful contextual elements. This\nnuanced manipulation exploits the cognitive principle that human perception of\nconcepts is shaped by the entire visual scene and its context, producing images\nthat amplify emotional harm far beyond attacks that merely preserve the\noriginal semantics. To address this, we first construct an imagery toxicity\ntaxonomy spanning 10 major and 48 sub-categories, aligned with human\ncognitive-perceptual dimensions, and further build a toxicity risk matrix\nresulting in 1,176 high-quality T2I toxic prompts. Based on this, our CogMorph\nfirst introduces Cognitive Toxicity Augmentation, which develops a cognitive\ntoxicity knowledge base with rich external toxic representations for humans\n(e.g., fine-grained visual features) that can be utilized to further guide the\noptimization of adversarial prompts. In addition, we present Contextual\nHierarchical Morphing, which hierarchically extracts critical parts of the\noriginal prompt (e.g., scenes, subjects, and body parts), and then iteratively\nretrieves and fuses toxic features to inject harmful contexts. Extensive\nexperiments on multiple open-sourced T2I models and black-box commercial APIs\n(e.g., DALLE-3) demonstrate the efficacy of CogMorph which significantly\noutperforms other baselines by large margins (+20.62\\% on average).\n","authors":["Zonglei Jing","Zonghao Ying","Le Wang","Siyuan Liang","Aishan Liu","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2501.11815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10826v2","updated":"2025-01-21T01:28:53Z","published":"2024-03-16T06:26:52Z","title":"MambaMOT: State-Space Model as Motion Predictor for Multi-Object\n Tracking","summary":" In the field of multi-object tracking (MOT), traditional methods often rely\non the Kalman filter for motion prediction, leveraging its strengths in linear\nmotion scenarios. However, the inherent limitations of these methods become\nevident when confronted with complex, nonlinear motions and occlusions\nprevalent in dynamic environments like sports and dance. This paper explores\nthe possibilities of replacing the Kalman filter with a learning-based motion\nmodel that effectively enhances tracking accuracy and adaptability beyond the\nconstraints of Kalman filter-based tracker. In this paper, our proposed method\nMambaMOT and MambaMOT+, demonstrate advanced performance on challenging MOT\ndatasets such as DanceTrack and SportsMOT, showcasing their ability to handle\nintricate, non-linear motion patterns and frequent occlusions more effectively\nthan traditional methods.\n","authors":["Hsiang-Wei Huang","Cheng-Yen Yang","Wenhao Chai","Zhongyu Jiang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2403.10826v2.pdf","comment":"Accepted by ICASSP 2025. 
Previous version paper title: Exploring\n Learning-based Motion Models in Multi-Object Tracking"},{"id":"http://arxiv.org/abs/2501.09672v2","updated":"2025-01-21T01:04:52Z","published":"2025-01-16T17:08:12Z","title":"Robin: a Suite of Multi-Scale Vision-Language Models and the CHIRP\n Evaluation Benchmark","summary":" The proliferation of Vision-Language Models (VLMs) in the past several years\ncalls for rigorous and comprehensive evaluation methods and benchmarks. This\nwork analyzes existing VLM evaluation techniques, including automated metrics,\nAI-based assessments, and human evaluations across diverse tasks. We first\nintroduce Robin - a novel suite of VLMs that we built by combining Large\nLanguage Models (LLMs) and Vision Encoders (VEs) at multiple scales, and use\nRobin to identify shortcomings of current evaluation approaches across scales.\nNext, to overcome the identified limitations, we introduce CHIRP - a new long\nform response benchmark we developed for more robust and complete VLM\nevaluation. We provide open access to the Robin training code, model suite, and\nCHIRP benchmark to promote reproducibility and advance VLM research.\n","authors":["Alexis Roger","Prateek Humane","Daniel Z. Kaplan","Kshitij Gupta","Qi Sun","George Adamopoulos","Jonathan Siu Chi Lim","Quentin Anthony","Edwin Fennell","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2501.09672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08506v2","updated":"2025-01-21T01:01:20Z","published":"2025-01-15T00:56:59Z","title":"Exploring the Efficacy of Meta-Learning: Unveiling Superior Data\n Diversity Utilization of MAML Over Pre-training","summary":" Currently, data and model size dominate the narrative in the training of\nsuper-large, powerful models. However, there has been a lack of exploration on\nthe effect of other attributes of the training dataset on model performance. We\nhypothesize that dataset diversity can impact the performance of vision models.\nOur study shows positive correlations between test set accuracy and data\ndiversity, providing an argument for furthering the research of dataset\nattributes beyond size. We analyzed pre-training and model-agnostic\nmeta-learning methods on twelve popular visual datasets (e.g., Omniglot,\nCIFAR-FS, Aircraft) and five model configurations, including MAML variants with\ndifferent numbers of inner gradient steps and supervised learning. We show\nmoderate to strong positive correlations (R-squared: 0.15-0.42) between\naccuracy and data diversity and weaker but significant correlations (R-squared:\n~0.2) between loss and diversity. These findings support our hypothesis and\ndemonstrate a promising way for a deeper exploration of how formal data\ndiversity influences model performance. This initial study highlights the\npotential of (Task2Vec) data diversity as a valuable measure in the rapidly\nevolving field of large-scale learning and emphasizes that understanding the\ndataset is key to building more powerful and generalizable models.\n","authors":["Kavita Selva","Satita Vittayaareekul","Brando Miranda"],"pdf_url":"https://arxiv.org/pdf/2501.08506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11800v1","updated":"2025-01-21T00:34:32Z","published":"2025-01-21T00:34:32Z","title":"TFLOP: Table Structure Recognition Framework with Layout Pointer\n Mechanism","summary":" Table Structure Recognition (TSR) is a task aimed at converting table images\ninto a machine-readable format (e.g. 
HTML), to facilitate other applications\nsuch as information retrieval. Recent works tackle this problem by identifying\nthe HTML tags and text regions, where the latter is used for text extraction\nfrom the table document. These works however, suffer from misalignment issues\nwhen mapping text into the identified text regions. In this paper, we introduce\na new TSR framework, called TFLOP (TSR Framework with LayOut Pointer\nmechanism), which reformulates the conventional text region prediction and\nmatching into a direct text region pointing problem. Specifically, TFLOP\nutilizes text region information to identify both the table's structure tags\nand its aligned text regions, simultaneously. Without the need for region\nprediction and alignment, TFLOP circumvents the additional text region matching\nstage, which requires finely-calibrated post-processing. TFLOP also employs\nspan-aware contrastive supervision to enhance the pointing mechanism in tables\nwith complex structure. As a result, TFLOP achieves the state-of-the-art\nperformance across multiple benchmarks such as PubTabNet, FinTabNet, and\nSynthTabNet. In our extensive experiments, TFLOP not only exhibits competitive\nperformance but also shows promising results on industrial document TSR\nscenarios such as documents with watermarks or in non-English domain.\n","authors":["Minsoo Khang","Teakgyu Hong"],"pdf_url":"https://arxiv.org/pdf/2501.11800v1.pdf","comment":"Published in IJCAI Proceedings 2024"},{"id":"http://arxiv.org/abs/2501.11795v1","updated":"2025-01-21T00:07:55Z","published":"2025-01-21T00:07:55Z","title":"Provably effective detection of effective data poisoning attacks","summary":" This paper establishes a mathematically precise definition of dataset\npoisoning attack and proves that the very act of effectively poisoning a\ndataset ensures that the attack can be effectively detected. On top of a\nmathematical guarantee that dataset poisoning is identifiable by a new\nstatistical test that we call the Conformal Separability Test, we provide\nexperimental evidence that we can adequately detect poisoning attempts in the\nreal world.\n","authors":["Jonathan Gallagher","Yasaman Esfandiari","Callen MacPhee","Michael Warren"],"pdf_url":"https://arxiv.org/pdf/2501.11795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09412v2","updated":"2025-01-21T23:23:29Z","published":"2024-09-14T10:59:25Z","title":"Label Convergence: Defining an Upper Performance Bound in Object\n Recognition through Contradictory Annotations","summary":" Annotation errors are a challenge not only during training of machine\nlearning models, but also during their evaluation. Label variations and\ninaccuracies in datasets often manifest as contradictory examples that deviate\nfrom established labeling conventions. Such inconsistencies, when significant,\nprevent models from achieving optimal performance on metrics such as mean\nAverage Precision (mAP). We introduce the notion of \"label convergence\" to\ndescribe the highest achievable performance under the constraint of\ncontradictory test annotations, essentially defining an upper bound on model\naccuracy.\n Recognizing that noise is an inherent characteristic of all data, our study\nanalyzes five real-world datasets, including the LVIS dataset, to investigate\nthe phenomenon of label convergence. We approximate that label convergence is\nbetween 62.63-67.52 mAP@[0.5:0.95:0.05] for LVIS with 95% confidence,\nattributing these bounds to the presence of real annotation errors. 
With\ncurrent state-of-the-art (SOTA) models at the upper end of the label\nconvergence interval for the well-studied LVIS dataset, we conclude that model\ncapacity is sufficient to solve current object detection problems. Therefore,\nfuture efforts should focus on three key aspects: (1) updating the problem\nspecification and adjusting evaluation practices to account for unavoidable\nlabel noise, (2) creating cleaner data, especially test data, and (3) including\nmulti-annotated data to investigate annotation variation and make these issues\nvisible from the outset.\n","authors":["David Tschirschwitz","Volker Rodehorst"],"pdf_url":"https://arxiv.org/pdf/2409.09412v2.pdf","comment":"Accepted at WACV 2025, added reference to paper associated code"},{"id":"http://arxiv.org/abs/2501.12535v1","updated":"2025-01-21T22:57:09Z","published":"2025-01-21T22:57:09Z","title":"How Does the Spatial Distribution of Pre-training Data Affect Geospatial\n Foundation Models?","summary":" Foundation models have made rapid advances in many domains including Earth\nobservation, where Geospatial Foundation Models (GFMs) can help address global\nchallenges such as climate change, agriculture, and disaster response. Previous\nwork on GFMs focused on tailoring model architecture and pre-text tasks, and\ndid not investigate the impact of pre-training data selection on model\nperformance. However, recent works from other domains show that the\npre-training data distribution is an important factor influencing the\nperformance of the foundation models. With this motivation, our research\nexplores how the geographic distribution of pre-training data affects the\nperformance of GFMs. We evaluated several pre-training data distributions by\nsampling different compositions from a global data pool. Our experiments with\ntwo GFMs on downstream tasks indicate that balanced and globally representative\ndata compositions often outperform region-specific sampling, highlighting the\nimportance of diversity and global coverage in pre-training data. Our results\nsuggest that the most appropriate data sampling technique may depend on the\nspecific GFM architecture. These findings will support the development of\nrobust GFMs by incorporating quality pre-training data distributions,\nultimately improving machine learning solutions for Earth observation.\n","authors":["Mirali Purohit","Gedeon Muhawenayo","Esther Rolf","Hannah Kerner"],"pdf_url":"https://arxiv.org/pdf/2501.12535v1.pdf","comment":"Accepted at Good Data for Generative AI @ AAAI 2025"},{"id":"http://arxiv.org/abs/2501.12524v1","updated":"2025-01-21T22:28:22Z","published":"2025-01-21T22:28:22Z","title":"Efficient Lung Ultrasound Severity Scoring Using Dedicated Feature\n Extractor","summary":" With the advent of the COVID-19 pandemic, ultrasound imaging has emerged as a\npromising technique for COVID-19 detection, due to its non-invasive nature,\naffordability, and portability. In response, researchers have focused on\ndeveloping AI-based scoring systems to provide real-time diagnostic support.\nHowever, the limited size and lack of proper annotation in publicly available\nultrasound datasets pose significant challenges for training a robust AI model.\nThis paper proposes MeDiVLAD, a novel pipeline to address the above issue for\nmulti-level lung-ultrasound (LUS) severity scoring. In particular, we leverage\nself-knowledge distillation to pretrain a vision transformer (ViT) without\nlabel and aggregate frame-level features via dual-level VLAD aggregation. 
We\nshow that with minimal finetuning, MeDiVLAD outperforms conventional\nfully-supervised methods in both frame- and video-level scoring, while offering\nclassification reasoning with exceptional quality. This superior performance\nenables key applications such as the automatic identification of critical lung\npathology areas and provides a robust solution for broader medical video\nclassification tasks.\n","authors":["Jiaqi Guo","Yunnan Wu","Evangelos Kaimakamis","Georgios Petmezas","Vasileios E. Papageorgiou","Nicos Maglaveras","Aggelos K. Katsaggelos"],"pdf_url":"https://arxiv.org/pdf/2501.12524v1.pdf","comment":"Accepted by IEEE ISBI 2025"},{"id":"http://arxiv.org/abs/2410.11783v2","updated":"2025-01-21T21:46:26Z","published":"2024-10-15T17:02:32Z","title":"LatentBKI: Open-Dictionary Continuous Mapping in Visual-Language Latent\n Spaces with Quantifiable Uncertainty","summary":" This paper introduces a novel probabilistic mapping algorithm, LatentBKI,\nwhich enables open-vocabulary mapping with quantifiable uncertainty.\nTraditionally, semantic mapping algorithms focus on a fixed set of semantic\ncategories which limits their applicability for complex robotic tasks.\nVision-Language (VL) models have recently emerged as a technique to jointly\nmodel language and visual features in a latent space, enabling semantic\nrecognition beyond a predefined, fixed set of semantic classes. LatentBKI\nrecurrently incorporates neural embeddings from VL models into a voxel map with\nquantifiable uncertainty, leveraging the spatial correlations of nearby\nobservations through Bayesian Kernel Inference (BKI). LatentBKI is evaluated\nagainst similar explicit semantic mapping and VL mapping frameworks on the\npopular Matterport3D and Semantic KITTI datasets, demonstrating that LatentBKI\nmaintains the probabilistic benefits of continuous mapping with the additional\nbenefit of open-dictionary queries. Real-world experiments demonstrate\napplicability to challenging indoor environments.\n","authors":["Joey Wilson","Ruihan Xu","Yile Sun","Parker Ewen","Minghan Zhu","Kira Barton","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2410.11783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05072v2","updated":"2025-01-21T21:33:06Z","published":"2024-04-07T21:00:14Z","title":"Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind","summary":" As humans move around, performing their daily tasks, they are able to recall\nwhere they have positioned objects in their environment, even if these objects\nare currently out of their sight. In this paper, we aim to mimic this spatial\ncognition ability. We thus formulate the task of Out of Sight, Not Out of Mind\n- 3D tracking active objects using observations captured through an egocentric\ncamera. We introduce a simple but effective approach to address this\nchallenging problem, called Lift, Match, and Keep (LMK). LMK lifts partial 2D\nobservations to 3D world coordinates, matches them over time using visual\nappearance, 3D location and interactions to form object tracks, and keeps these\nobject tracks even when they go out-of-view of the camera. We benchmark LMK on\n100 long videos from EPIC-KITCHENS. Our results demonstrate that spatial\ncognition is critical for correctly locating objects over short and long time\nscales. E.g., for one long egocentric video, we estimate the 3D location of 50\nactive objects. 
After 120 seconds, 57% of the objects are correctly localised\nby LMK, compared to just 33% by a recent 3D method for egocentric videos and\n17% by a general 2D tracking method.\n","authors":["Chiara Plizzari","Shubham Goel","Toby Perrett","Jacob Chalk","Angjoo Kanazawa","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2404.05072v2.pdf","comment":"Accepted at 3DV 2025. 14 pages including references and appendix.\n Project Webpage: http://dimadamen.github.io/OSNOM/"},{"id":"http://arxiv.org/abs/2501.12489v1","updated":"2025-01-21T20:30:51Z","published":"2025-01-21T20:30:51Z","title":"Large-image Object Detection for Fine-grained Recognition of Punches\n Patterns in Medieval Panel Painting","summary":" The attribution of the author of an art piece is typically a laborious manual\nprocess, usually relying on subjective evaluations of expert figures. However,\nthere are some situations in which quantitative features of the artwork can\nsupport these evaluations. The extraction of these features can sometimes be\nautomated, for instance, with the use of Machine Learning (ML) techniques. An\nexample of these features is represented by repeated, mechanically impressed\npatterns, called punches, present chiefly in 13th and 14th-century panel\npaintings from Tuscany. Previous research in art history showcased a strong\nconnection between the shapes of punches and specific artists or workshops,\nsuggesting the possibility of using these quantitative cues to support the\nattribution. In the present work, we first collect a dataset of large-scale\nimages of these panel paintings. Then, using YOLOv10, a recent and popular\nobject detection model, we train a ML pipeline to perform object detection on\nthe punches contained in the images. Due to the large size of the images, the\ndetection procedure is split across multiple frames by adopting a\nsliding-window approach with overlaps, after which the predictions are combined\nfor the whole image using a custom non-maximal suppression routine. Our results\nindicate how art historians working in the field can reliably use our method\nfor the identification and extraction of punches.\n","authors":["Josh Bruegger","Diana Ioana Catana","Vanja Macovaz","Matias Valdenegro-Toro","Matthia Sabatelli","Marco Zullich"],"pdf_url":"https://arxiv.org/pdf/2501.12489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12488v1","updated":"2025-01-21T20:30:15Z","published":"2025-01-21T20:30:15Z","title":"Bidirectional Brain Image Translation using Transfer Learning from\n Generic Pre-trained Models","summary":" Brain imaging plays a crucial role in the diagnosis and treatment of various\nneurological disorders, providing valuable insights into the structure and\nfunction of the brain. Techniques such as magnetic resonance imaging (MRI) and\ncomputed tomography (CT) enable non-invasive visualization of the brain, aiding\nin the understanding of brain anatomy, abnormalities, and functional\nconnectivity. However, cost and radiation dose may limit the acquisition of\nspecific image modalities, so medical image synthesis can be used to generate\nrequired medical images without actual addition. In the medical domain, where\nobtaining labeled medical images is labor-intensive and expensive, addressing\ndata scarcity is a major challenge. Recent studies propose using transfer\nlearning to overcome this issue. This involves adapting pre-trained CycleGAN\nmodels, initially trained on non-medical data, to generate realistic medical\nimages. 
In this work, transfer learning was applied to the task of MR-CT image\ntranslation and vice versa using 18 pre-trained non-medical models, and the\nmodels were fine-tuned to have the best result. The models' performance was\nevaluated using four widely used image quality metrics:\nPeak-signal-to-noise-ratio, Structural Similarity Index, Universal Quality\nIndex, and Visual Information Fidelity. Quantitative evaluation and qualitative\nperceptual analysis by radiologists demonstrate the potential of transfer\nlearning in medical imaging and the effectiveness of the generic pre-trained\nmodel. The results provide compelling evidence of the model's exceptional\nperformance, which can be attributed to the high quality and similarity of the\ntraining images to actual human brain images. These results underscore the\nsignificance of carefully selecting appropriate and representative training\nimages to optimize performance in brain image analysis tasks.\n","authors":["Fatima Haimour","Rizik Al-Sayyed","Waleed Mahafza","Omar S. Al-Kadi"],"pdf_url":"https://arxiv.org/pdf/2501.12488v1.pdf","comment":"19 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2501.12487v1","updated":"2025-01-21T20:23:22Z","published":"2025-01-21T20:23:22Z","title":"fabSAM: A Farmland Boundary Delineation Method Based on the Segment\n Anything Model","summary":" Delineating farmland boundaries is essential for agricultural management such\nas crop monitoring and agricultural census. Traditional methods using remote\nsensing imagery have been efficient but limited in generalisation. The Segment\nAnything Model (SAM), known for its impressive zero shot performance, has been\nadapted for remote sensing tasks through prompt learning and fine tuning. Here,\nwe propose a SAM based farmland boundary delineation framework 'fabSAM' that\ncombines a Deeplabv3+ based Prompter and SAM. Also, a fine tuning strategy was\nintroduced to enable SAMs decoder to improve the use of prompt information.\nExperimental results on the AI4Boundaries and AI4SmallFarms datasets have shown\nthat fabSAM has a significant improvement in farmland region identification and\nboundary delineation. Compared to zero shot SAM, fabSAM surpassed it by 23.5%\nand 15.1% in mIOU on the AI4Boundaries and AI4SmallFarms datasets,\nrespectively. For Deeplabv3+, fabSAM outperformed it by 4.9% and 12.5% in mIOU,\nrespectively. These results highlight the effectiveness of fabSAM, which also\nmeans that we can more easily obtain the global farmland region and boundary\nmaps from open source satellite image datasets like Sentinel2.\n","authors":["Yufeng Xie","Hanzhi Wu","Hongxiang Tong","Lei Xiao","Wenwen Zhou","Ling Li","Thomas Cherico Wanger"],"pdf_url":"https://arxiv.org/pdf/2501.12487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12482v1","updated":"2025-01-21T20:20:34Z","published":"2025-01-21T20:20:34Z","title":"TOFFE -- Temporally-binned Object Flow from Events for High-speed and\n Energy-Efficient Object Detection and Tracking","summary":" Object detection and tracking is an essential perception task for enabling\nfully autonomous navigation in robotic systems. Edge robot systems such as\nsmall drones need to execute complex maneuvers at high-speeds with limited\nresources, which places strict constraints on the underlying algorithms and\nhardware. Traditionally, frame-based cameras are used for vision-based\nperception due to their rich spatial information and simplified synchronous\nsensing capabilities. 
However, obtaining detailed information across frames\nincurs high energy consumption and may not even be required. In addition, their\nlow temporal resolution renders them ineffective in high-speed motion\nscenarios. Event-based cameras offer a biologically-inspired solution to this\nby capturing only changes in intensity levels at exceptionally high temporal\nresolution and low power consumption, making them ideal for high-speed motion\nscenarios. However, their asynchronous and sparse outputs are not natively\nsuitable with conventional deep learning methods. In this work, we propose\nTOFFE, a lightweight hybrid framework for performing event-based object motion\nestimation (including pose, direction, and speed estimation), referred to as\nObject Flow. TOFFE integrates bio-inspired Spiking Neural Networks (SNNs) and\nconventional Analog Neural Networks (ANNs), to efficiently process events at\nhigh temporal resolutions while being simple to train. Additionally, we present\na novel event-based synthetic dataset involving high-speed object motion to\ntrain TOFFE. Our experimental results show that TOFFE achieves 5.7x/8.3x\nreduction in energy consumption and 4.6x/5.8x reduction in latency on edge\nGPU(Jetson TX2)/hybrid hardware(Loihi-2 and Jetson TX2), compared to previous\nevent-based object detection baselines.\n","authors":["Adarsh Kumar Kosta","Amogh Joshi","Arjun Roy","Rohan Kumar Manna","Manish Nagaraj","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2501.12482v1.pdf","comment":"8 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.05938v2","updated":"2025-01-21T20:14:02Z","published":"2024-08-12T06:25:44Z","title":"Deep Geometric Moments Promote Shape Consistency in Text-to-3D\n Generation","summary":" To address the data scarcity associated with 3D assets, 2D-lifting techniques\nsuch as Score Distillation Sampling (SDS) have become a widely adopted practice\nin text-to-3D generation pipelines. However, the diffusion models used in these\ntechniques are prone to viewpoint bias and thus lead to geometric\ninconsistencies such as the Janus problem. To counter this, we introduce MT3D,\na text-to-3D generative model that leverages a high-fidelity 3D object to\novercome viewpoint bias and explicitly infuse geometric understanding into the\ngeneration pipeline. Firstly, we employ depth maps derived from a high-quality\n3D model as control signals to guarantee that the generated 2D images preserve\nthe fundamental shape and structure, thereby reducing the inherent viewpoint\nbias. Next, we utilize deep geometric moments to ensure geometric consistency\nin the 3D representation explicitly. By incorporating geometric details from a\n3D asset, MT3D enables the creation of diverse and geometrically consistent\nobjects, thereby improving the quality and usability of our 3D representations.\nProject page and code: https://moment-3d.github.io/\n","authors":["Utkarsh Nath","Rajeev Goel","Eun Som Jeon","Changhoon Kim","Kyle Min","Yezhou Yang","Yingzhen Yang","Pavan Turaga"],"pdf_url":"https://arxiv.org/pdf/2408.05938v2.pdf","comment":"This paper has been accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2501.12477v1","updated":"2025-01-21T19:59:22Z","published":"2025-01-21T19:59:22Z","title":"Slot-BERT: Self-supervised Object Discovery in Surgical Video","summary":" Object-centric slot attention is a powerful framework for unsupervised\nlearning of structured and explainable representations that can support\nreasoning about objects and actions, including in surgical videos. 
While\nconventional object-centric methods for videos leverage recurrent processing to\nachieve efficiency, they often struggle with maintaining long-range temporal\ncoherence required for long videos in surgical applications. On the other hand,\nfully parallel processing of entire videos enhances temporal consistency but\nintroduces significant computational overhead, making it impractical for\nimplementation on hardware in medical facilities. We present Slot-BERT, a\nbidirectional long-range model that learns object-centric representations in a\nlatent space while ensuring robust temporal coherence. Slot-BERT scales object\ndiscovery seamlessly to long videos of unconstrained lengths. A novel slot\ncontrastive loss further reduces redundancy and improves the representation\ndisentanglement by enhancing slot orthogonality. We evaluate Slot-BERT on\nreal-world surgical video datasets from abdominal, cholecystectomy, and\nthoracic procedures. Our method surpasses state-of-the-art object-centric\napproaches under unsupervised training achieving superior performance across\ndiverse domains. We also demonstrate efficient zero-shot domain adaptation to\ndata from diverse surgical specialties and databases.\n","authors":["Guiqiu Liao","Matjaz Jogan","Marcel Hussing","Kenta Nakahashi","Kazuhiro Yasufuku","Amin Madani","Eric Eaton","Daniel A. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2501.12477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15020v3","updated":"2025-01-21T19:32:07Z","published":"2024-05-23T19:51:33Z","title":"AdjointDEIS: Efficient Gradients for Diffusion Models","summary":" The optimization of the latents and parameters of diffusion models with\nrespect to some differentiable metric defined on the output of the model is a\nchallenging and complex problem. The sampling for diffusion models is done by\nsolving either the probability flow ODE or diffusion SDE wherein a neural\nnetwork approximates the score function allowing a numerical ODE/SDE solver to\nbe used. However, naive backpropagation techniques are memory intensive,\nrequiring the storage of all intermediate states, and face additional\ncomplexity in handling the injected noise from the diffusion term of the\ndiffusion SDE. We propose a novel family of bespoke ODE solvers to the\ncontinuous adjoint equations for diffusion models, which we call AdjointDEIS.\nWe exploit the unique construction of diffusion SDEs to further simplify the\nformulation of the continuous adjoint equations using exponential integrators.\nMoreover, we provide convergence order guarantees for our bespoke solvers.\nSignificantly, we show that continuous adjoint equations for diffusion SDEs\nactually simplify to a simple ODE. Lastly, we demonstrate the effectiveness of\nAdjointDEIS for guided generation with an adversarial attack in the form of the\nface morphing problem. Our code will be released at https:\n//github.com/zblasingame/AdjointDEIS.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2405.15020v3.pdf","comment":"NeurIPS 2024 conference paper"},{"id":"http://arxiv.org/abs/2501.12433v1","updated":"2025-01-21T18:41:28Z","published":"2025-01-21T18:41:28Z","title":"Owls are wise and foxes are unfaithful: Uncovering animal stereotypes in\n vision-language models","summary":" Animal stereotypes are deeply embedded in human culture and language. They\noften shape our perceptions and expectations of various species. 
Our study\ninvestigates how animal stereotypes manifest in vision-language models during\nthe task of image generation. Through targeted prompts, we explore whether\nDALL-E perpetuates stereotypical representations of animals, such as \"owls as\nwise,\" \"foxes as unfaithful,\" etc. Our findings reveal significant stereotyped\ninstances where the model consistently generates images aligned with cultural\nbiases. The current work is the first of its kind to examine animal\nstereotyping in vision-language models systematically and to highlight a\ncritical yet underexplored dimension of bias in AI-generated visual content.\n","authors":["Tabinda Aman","Mohammad Nadeem","Shahab Saquib Sohail","Mohammad Anas","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2501.12433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09764v2","updated":"2025-01-21T17:39:12Z","published":"2023-09-18T13:44:36Z","title":"Application-driven Validation of Posteriors in Inverse Problems","summary":" Current deep learning-based solutions for image analysis tasks are commonly\nincapable of handling problems to which multiple different plausible solutions\nexist. In response, posterior-based methods such as conditional Diffusion\nModels and Invertible Neural Networks have emerged; however, their translation\nis hampered by a lack of research on adequate validation. In other words, the\nway progress is measured often does not reflect the needs of the driving\npractical application. Closing this gap in the literature, we present the first\nsystematic framework for the application-driven validation of posterior-based\nmethods in inverse problems. As a methodological novelty, it adopts key\nprinciples from the field of object detection validation, which has a long\nhistory of addressing the question of how to locate and match multiple object\ninstances in an image. Treating modes as instances enables us to perform\nmode-centric validation, using well-interpretable metrics from the application\nperspective. We demonstrate the value of our framework through instantiations\nfor a synthetic toy example and two medical vision use cases: pose estimation\nin surgery and imaging-based quantification of functional tissue parameters for\ndiagnostics. Our framework offers key advantages over common approaches to\nposterior validation in all three examples and could thus revolutionize\nperformance assessment in inverse problems.\n","authors":["Tim J. Adler","Jan-Hinrich Nölke","Annika Reinke","Minu Dietlinde Tizabi","Sebastian Gruber","Dasha Trofimova","Lynton Ardizzone","Paul F. Jaeger","Florian Buettner","Ullrich Köthe","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2309.09764v2.pdf","comment":"Accepted at Medical Image Analysis. Shared first authors: Tim J.\n Adler and Jan-Hinrich N\\\"olke. 24 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2304.04521v4","updated":"2025-01-21T17:01:33Z","published":"2023-04-10T11:35:42Z","title":"GL-MCM: Global and Local Maximum Concept Matching for Zero-Shot\n Out-of-Distribution Detection","summary":" Zero-shot out-of-distribution (OOD) detection is a task that detects OOD\nimages during inference with only in-distribution (ID) class names. Existing\nmethods assume ID images contain a single, centered object, and do not consider\nthe more realistic multi-object scenarios, where both ID and OOD objects are\npresent. To meet the needs of many users, the detection method must have the\nflexibility to adapt the type of ID images. 
To this end, we present\nGlobal-Local Maximum Concept Matching (GL-MCM), which incorporates local image\nscores as an auxiliary score to enhance the separability of global and local\nvisual features. Due to the simple ensemble score function design, GL-MCM can\ncontrol the type of ID images with a single weight parameter. Experiments on\nImageNet and multi-object benchmarks demonstrate that GL-MCM outperforms\nbaseline zero-shot methods and is comparable to fully supervised methods.\nFurthermore, GL-MCM offers strong flexibility in adjusting the target type of\nID images. The code is available via https://github.com/AtsuMiyai/GL-MCM.\n","authors":["Atsuyuki Miyai","Qing Yu","Go Irie","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2304.04521v4.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV) 2025"}]},"2025-01-22T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.13072v1","updated":"2025-01-22T18:34:51Z","published":"2025-01-22T18:34:51Z","title":"AdaWM: Adaptive World Model based Planning for Autonomous Driving","summary":" World model based reinforcement learning (RL) has emerged as a promising\napproach for autonomous driving, which learns a latent dynamics model and uses\nit to train a planning policy. To speed up the learning process, the\npretrain-finetune paradigm is often used, where online RL is initialized by a\npretrained model and a policy learned offline. However, naively performing such\ninitialization in RL may result in dramatic performance degradation during the\nonline interactions in the new task. To tackle this challenge, we first analyze\nthe performance degradation and identify two primary root causes therein: the\nmismatch of the planning policy and the mismatch of the dynamics model, due to\ndistribution shift. We further analyze the effects of these factors on\nperformance degradation during finetuning, and our findings reveal that the\nchoice of finetuning strategies plays a pivotal role in mitigating these\neffects. We then introduce AdaWM, an Adaptive World Model based planning\nmethod, featuring two key steps: (a) mismatch identification, which quantifies\nthe mismatches and informs the finetuning strategy, and (b) alignment-driven\nfinetuning, which selectively updates either the policy or the model as needed\nusing efficient low-rank updates. Extensive experiments on the challenging\nCARLA driving tasks demonstrate that AdaWM significantly improves the\nfinetuning process, resulting in more robust and efficient performance in\nautonomous driving systems.\n","authors":["Hang Wang","Xin Ye","Feng Tao","Abhirup Mallik","Burhaneddin Yaman","Liu Ren","Junshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.13072v1.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2403.01536v2","updated":"2025-01-22T17:06:19Z","published":"2024-03-03T15:30:31Z","title":"Fast Ergodic Search with Kernel Functions","summary":" Ergodic search enables optimal exploration of an information distribution\nwhile guaranteeing the asymptotic coverage of the search space. However,\ncurrent methods typically have exponential computation complexity in the search\nspace dimension and are restricted to Euclidean space. We introduce a\ncomputationally efficient ergodic search method. Our contributions are\ntwo-fold. First, we develop a kernel-based ergodic metric and generalize it\nfrom Euclidean space to Lie groups. 
We formally prove the proposed metric is\nconsistent with the standard ergodic metric while guaranteeing linear\ncomplexity in the search space dimension. Secondly, we derive the first-order\noptimality condition of the kernel ergodic metric for nonlinear systems, which\nenables efficient trajectory optimization. Comprehensive numerical benchmarks\nshow that the proposed method is at least two orders of magnitude faster than\nthe state-of-the-art algorithm. Finally, we demonstrate the proposed algorithm\nwith a peg-in-hole insertion task. We formulate the problem as a coverage task\nin the space of SE(3) and use a 30-second-long human demonstration as the prior\ndistribution for ergodic coverage. Ergodicity guarantees the asymptotic\nsolution of the peg-in-hole problem so long as the solution resides within the\nprior information distribution, which is seen in the 100% success rate.\n","authors":["Max Muchen Sun","Ayush Gaggar","Peter Trautman","Todd Murphey"],"pdf_url":"https://arxiv.org/pdf/2403.01536v2.pdf","comment":"Accepted to IEEE Transactions on Robotics (T-RO). 20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.12869v1","updated":"2025-01-22T13:25:52Z","published":"2025-01-22T13:25:52Z","title":"Drone Carrier: An Integrated Unmanned Surface Vehicle for Autonomous\n Inspection and Intervention in GNSS-Denied Maritime Environment","summary":" This paper introduces an innovative drone carrier concept that is applied in\nmaritime port security or offshore rescue. This system works with a\nheterogeneous system consisting of multiple Unmanned Aerial Vehicles (UAVs) and\nUnmanned Surface Vehicles (USVs) to perform inspection and intervention tasks\nin GNSS-denied or interrupted environments. The carrier, an electric catamaran\nmeasuring 4m by 7m, features a 4m by 6m deck supporting automated takeoff and\nlanding for four DJI M300 drones, along with a 10kg-payload manipulator\noperable in up to level 3 sea conditions. Utilizing an offshore gimbal camera\nfor navigation, the carrier can autonomously navigate, approach and dock with\nnon-cooperative vessels, guided by an onboard camera, LiDAR, and Doppler\nVelocity Log (DVL) over a 3 km$^2$ area. UAVs equipped with onboard\nUltra-Wideband (UWB) technology execute mapping, detection, and manipulation\ntasks using a versatile gripper designed for wet, saline conditions.\nAdditionally, two UAVs can coordinate to transport large objects to the\nmanipulator or interact directly with them. These procedures are fully\nautomated and were successfully demonstrated at the Mohammed Bin Zayed\nInternational Robotic Competition (MBZIRC2024), where the drone carrier\nequipped with four UAVS and one manipulator, automatically accomplished the\nintervention tasks in sea-level-3 (wave height 1.25m) based on the rough target\ninformation.\n","authors":["Yihao Dong","Muhayyu Ud Din","Francesco Lagala","Hailiang Kuang","Jianjun Sun","Siyuan Yang","Irfan Hussain","Shaoming He"],"pdf_url":"https://arxiv.org/pdf/2501.12869v1.pdf","comment":"15 pages, 12pages"},{"id":"http://arxiv.org/abs/2501.12812v1","updated":"2025-01-22T11:42:19Z","published":"2025-01-22T11:42:19Z","title":"PSGSL: A Probabilistic Framework Integrating Semantic Scene\n Understanding and Gas Sensing for Gas Source Localization","summary":" Semantic scene understanding allows a robotic agent to reason about problems\nin complex ways, using information from multiple and varied sensors to make\ndeductions about a particular matter. 
As a result, this form of intelligent\nrobotics is capable of performing more complex tasks and achieving more precise\nresults than simpler approaches based on single data sources. However, these\nimproved capabilities come at the cost of higher complexity, both computational\nand in terms of design. Due to the increased design complexity, formal\napproaches for exploiting semantic understanding become necessary.\n We present here a probabilistic formulation for integrating semantic\nknowledge into the process of gas source localization (GSL). The problem of GSL\nposes many unsolved challenges, and proposed solutions need to contend with the\nconstraining limitations of sensing hardware. By exploiting semantic scene\nunderstanding, we can leverage other sources of information, such as vision, to\nimprove the estimation of the source location. We show how our formulation can\nbe applied to pre-existing GSL algorithms and the effect that including\nsemantic data has on the produced estimations of the location of the source.\n","authors":["Pepe Ojeda","Javier Monroy","Javier Gonzalez-Jimenez"],"pdf_url":"https://arxiv.org/pdf/2501.12812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12799v1","updated":"2025-01-22T11:13:31Z","published":"2025-01-22T11:13:31Z","title":"Int2Planner: An Intention-based Multi-modal Motion Planner for\n Integrated Prediction and Planning","summary":" Motion planning is a critical module in autonomous driving, with the primary\nchallenge of uncertainty caused by interactions with other participants. As\nmost previous methods treat prediction and planning as separate tasks, it is\ndifficult to model these interactions. Furthermore, since the route path\nnavigates ego vehicles to a predefined destination, it provides relatively\nstable intentions for ego vehicles and helps constrain uncertainty. On this\nbasis, we construct Int2Planner, an \\textbf{Int}ention-based\n\\textbf{Int}egrated motion \\textbf{Planner} that achieves multi-modal planning\nand prediction. Instead of static intention points, Int2Planner utilizes route\nintention points for ego vehicles and generates corresponding planning\ntrajectories for each intention point to facilitate multi-modal planning. The\nexperiments on the private dataset and the public nuPlan benchmark show the\neffectiveness of route intention points, and Int2Planner achieves\nstate-of-the-art performance. We also deploy it in real-world vehicles and have\nconducted autonomous driving for hundreds of kilometers in urban areas. It\nfurther verifies that Int2Planner can continuously interact with the traffic\nenvironment. Code will be available at https://github.com/cxlz/Int2Planner.\n","authors":["Xiaolei Chen","Junchi Yan","Wenlong Liao","Tao He","Pai Peng"],"pdf_url":"https://arxiv.org/pdf/2501.12799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12764v1","updated":"2025-01-22T10:00:28Z","published":"2025-01-22T10:00:28Z","title":"Grid-based Submap Joining: An Efficient Algorithm for Simultaneously\n Optimizing Global Occupancy Map and Local Submap Frames","summary":" Optimizing robot poses and the map simultaneously has been shown to provide\nmore accurate SLAM results. However, for non-feature based SLAM approaches,\ndirectly optimizing all the robot poses and the whole map will greatly increase\nthe computational cost, making SLAM problems difficult to solve in large-scale\nenvironments. 
To solve the 2D non-feature based SLAM problem in large-scale\nenvironments more accurately and efficiently, we propose the grid-based submap\njoining method. Specifically, we first formulate the 2D grid-based submap\njoining problem as a non-linear least squares (NLLS) form to optimize the\nglobal occupancy map and local submap frames simultaneously. We then prove that\nin solving the NLLS problem using the Gauss-Newton (GN) method, the increments\nof the poses in each iteration are independent of the occupancy values of the\nglobal occupancy map. Based on this property, we propose a pose-only GN\nalgorithm equivalent to the full GN method to solve the NLLS problem. The\nproposed submap joining algorithm is very efficient due to the independent\nproperty and the pose-only solution. Evaluations using simulations and publicly\navailable practical 2D laser datasets confirm that our proposed method\noutperforms state-of-the-art methods in terms of efficiency and accuracy, and\nthat it can solve the grid-based SLAM problem in very large-scale\nenvironments.\n","authors":["Yingyu Wang","Liang Zhao","Shoudong Huang"],"pdf_url":"https://arxiv.org/pdf/2501.12764v1.pdf","comment":"Accepted by IROS 2024"},{"id":"http://arxiv.org/abs/2410.12345v3","updated":"2025-01-22T08:55:37Z","published":"2024-10-16T08:05:56Z","title":"A Data-driven Contact Estimation Method for Wheeled-Biped Robots","summary":" Contact estimation is a key ability for limbed robots, where making and\nbreaking contacts has a direct impact on state estimation and balance control.\nExisting approaches typically rely on gait-cycle priors or designated contact\nsensors. We design a contact estimator that is suitable for the emerging\nwheeled-biped robot types that do not have these features. To this end, we\npropose a Bayes filter in which update steps are learned from real-robot torque\nmeasurements while prediction steps rely on inertial measurements. We evaluate\nthis approach in extensive real-robot and simulation experiments. Our method\nachieves better performance while being considerably more sample efficient than\na comparable deep-learning baseline.\n","authors":["Ü. Bora Gökbakan","Frederike Dümbgen","Stéphane Caron"],"pdf_url":"https://arxiv.org/pdf/2410.12345v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10074v2","updated":"2025-01-22T08:36:33Z","published":"2025-01-17T09:46:27Z","title":"SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and\n Chain-of-Thought for Embodied Task Planning","summary":" Spatial reasoning is an essential problem in embodied AI research. Efforts to\nenhance spatial reasoning abilities through supplementary spatial data and\nfine-tuning have proven limited and ineffective when addressing complex\nembodied tasks, largely due to their dependence on language-based outputs.\nWhile some approaches have introduced a point-based action space to mitigate\nthis issue, they fall short in managing more intricate tasks within complex\nenvironments. This deficiency arises from their failure to fully exploit the\ninherent thinking and reasoning capabilities that are fundamental strengths of\nVision-Language Models (VLMs). To address these limitations, we propose a novel\napproach named SpatialCoT, specifically designed to bolster the spatial\nreasoning capabilities of VLMs. 
Our approach comprises two stages: spatial\ncoordinate bi-directional alignment, which aligns vision-language inputs with\nspatial coordinates, and chain-of-thought spatial grounding, which harnesses\nthe reasoning capabilities of language models for advanced spatial reasoning.\nWe evaluate SpatialCoT on challenging navigation and manipulation tasks, both\nin simulation and real-world settings. Experimental results demonstrate that\nour method significantly outperforms previous state-of-the-art approaches in\nboth tasks.\n","authors":["Yuecheng Liu","Dafeng Chi","Shiguang Wu","Zhanguang Zhang","Yaochen Hu","Lingfeng Zhang","Yingxue Zhang","Shuang Wu","Tongtong Cao","Guowei Huang","Helong Huang","Guangjian Tian","Weichao Qiu","Xingyue Quan","Jianye Hao","Yuzheng Zhuang"],"pdf_url":"https://arxiv.org/pdf/2501.10074v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2501.00368v2","updated":"2025-01-22T07:49:36Z","published":"2024-12-31T09:44:18Z","title":"Design Optimizer for Soft Growing Robot Manipulators in\n Three-Dimensional Environments","summary":" Soft growing robots are novel devices that mimic plant-like growth for\nnavigation in cluttered or dangerous environments. Their ability to adapt to\nsurroundings, combined with advancements in actuation and manufacturing\ntechnologies, allows them to perform specialized manipulation tasks. This work\npresents an approach for design optimization of soft growing robots;\nspecifically, the three-dimensional extension of the optimizer designed for\nplanar manipulators. This tool is intended to be used by engineers and robot\nenthusiasts before manufacturing their robot: it suggests the optimal size of\nthe robot for solving a specific task. The design process models a\nmulti-objective optimization problem to refine a soft manipulator's kinematic\nchain. Thanks to the novel Rank Partitioning algorithm integrated into\nEvolutionary Computation (EC) algorithms, this method achieves high precision\nin reaching targets and is efficient in resource usage. Results show\nsignificantly high performance in solving three-dimensional tasks, whereas\ncomparative experiments indicate that the optimizer features robust output when\ntested with different EC algorithms, particularly genetic algorithms.\n","authors":["Ahmet Astar","Ozan Nurcan","Erk Demirel","Emir Ozen","Ozan Kutlar","Fabio Stroppa"],"pdf_url":"https://arxiv.org/pdf/2501.00368v2.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.12654v1","updated":"2025-01-22T05:33:05Z","published":"2025-01-22T05:33:05Z","title":"AnyNav: Visual Neuro-Symbolic Friction Learning for Off-road Navigation","summary":" Off-road navigation is essential for a wide range of applications in field\nrobotics such as planetary exploration and disaster response. However, it\nremains an unresolved challenge due to the unstructured environments and\ninherent complexity of terrain-vehicle interactions. Traditional physics-based\nmethods struggle to accurately model the nonlinear dynamics of these\ninteractions, while data-driven approaches often suffer from overfitting to\nspecific motion patterns, vehicle sizes, and types, limiting their\ngeneralizability. To overcome these challenges, we introduce a vision-based\nfriction estimation framework grounded in neuro-symbolic principles,\nintegrating neural networks for visual perception with symbolic reasoning for\nphysical modeling. 
This enables significantly improved generalization abilities\nthrough explicit physical reasoning incorporating the predicted friction.\nAdditionally, we develop a physics-informed planner that leverages the learned\nfriction coefficient to generate physically feasible and efficient paths, along\nwith corresponding speed profiles. We refer to our approach as AnyNav and\nevaluate it in both simulation and real-world experiments, demonstrating its\nutility and robustness across various off-road scenarios and multiple types of\nfour-wheeled vehicles. These results mark an important step toward developing\nneuro-symbolic spatial intelligence to reason about complex, unstructured\nenvironments and enable autonomous off-road navigation in challenging\nscenarios. Video demonstrations are available at https://sairlab.org/anynav/,\nwhere the source code will also be released.\n","authors":["Taimeng Fu","Zitong Zhan","Zhipeng Zhao","Shaoshu Su","Xiao Lin","Ehsan Tarkesh Esfahani","Karthik Dantu","Souma Chowdhury","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2501.12654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12594v1","updated":"2025-01-22T02:42:27Z","published":"2025-01-22T02:42:27Z","title":"A 3-Step Optimization Framework with Hybrid Models for a Humanoid\n Robot's Jump Motion","summary":" High dynamic jump motions are challenging tasks for humanoid robots to\nachieve environment adaptation and obstacle crossing. The trajectory\noptimization is a practical method to achieve high-dynamic and explosive\njumping. This paper proposes a 3-step trajectory optimization framework for\ngenerating a jump motion for a humanoid robot. To improve iteration speed and\nachieve ideal performance, the framework comprises three sub-optimizations. The\nfirst optimization incorporates momentum, inertia, and center of pressure\n(CoP), treating the robot as a static reaction momentum pendulum (SRMP) model\nto generate corresponding trajectories. The second optimization maps these\ntrajectories to joint space using effective Quadratic Programming (QP) solvers.\nFinally, the third optimization generates whole-body joint trajectories\nutilizing trajectories generated by previous parts. With the combined\nconsideration of momentum and inertia, the robot achieves agile forward jump\nmotions. A simulation and experiments (Fig. \\ref{Fig First page fig}) of\nforward jump with a distance of 1.0 m and 0.5 m height are presented in this\npaper, validating the applicability of the proposed framework.\n","authors":["Haoxiang Qi","Zhangguo Yu","Xuechao Chen","Yaliang Liu","Chuanku Yi","Chencheng Dong","Fei Meng","Qiang Huang"],"pdf_url":"https://arxiv.org/pdf/2501.12594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13548v3","updated":"2025-01-22T02:30:41Z","published":"2024-12-18T06:49:46Z","title":"TelePreview: A User-Friendly Teleoperation System with Virtual Arm\n Assistance for Enhanced Effectiveness","summary":" Teleoperation provides an effective way to collect robot data, which is\ncrucial for learning from demonstrations. In this field, teleoperation faces\nseveral key challenges: user-friendliness for new users, safety assurance, and\ntransferability across different platforms. 
While collecting real robot\ndexterous manipulation data by teleoperation to train robots has shown\nimpressive results on diverse tasks, due to the morphological differences\nbetween human and robot hands, it is not only hard for new users to understand\nthe action mapping but also raises potential safety concerns during operation.\nTo address these limitations, we introduce TelePreview. This teleoperation\nsystem offers real-time visual feedback on robot actions based on human user\ninputs, with a total hardware cost of less than $1,000. TelePreview allows the\nuser to see a virtual robot that represents the outcome of the user's next\nmovement. By enabling flexible switching between command visualization and\nactual execution, this system helps new users learn how to demonstrate quickly\nand safely. We demonstrate that it outperforms other teleoperation systems\nacross five tasks, emphasize its ease of use, and highlight its straightforward\ndeployment across diverse robotic platforms. We release our code and a\ndeployment document on our website https://telepreview.github.io.\n","authors":["Jingxiang Guo","Jiayu Luo","Zhenyu Wei","Yiwen Hou","Zhixuan Xu","Xiaoyi Lin","Chongkai Gao","Lin Shao"],"pdf_url":"https://arxiv.org/pdf/2412.13548v3.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2501.09905v2","updated":"2025-01-22T01:48:31Z","published":"2025-01-17T01:32:18Z","title":"SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon\n Visuomotor Learning","summary":" We present a low-cost legged mobile manipulation system that solves\nlong-horizon real-world tasks, trained by reinforcement learning purely in\nsimulation. This system is made possible by 1) a hierarchical design of a\nhigh-level policy for visual-mobile manipulation following instructions and a\nlow-level policy for quadruped movement and limb control, 2) a progressive\nexploration and learning approach that leverages privileged task decomposition\ninformation to train the teacher policy for long-horizon tasks, which will\nguide an imitation-based student policy for efficient training of the\nhigh-level visuomotor policy, and 3) a suite of techniques for minimizing\nsim-to-real gaps.\n In contrast to previous approaches that use high-end equipment, our system\ndemonstrates effective performance with more accessible hardware -\nspecifically, a Unitree Go1 quadruped, a WidowX250S arm, and a single\nwrist-mounted RGB camera - despite the increased challenges of sim-to-real\ntransfer. When fully trained in simulation, a single policy autonomously solves\nlong-horizon tasks such as search, move, grasp, and drop-into, achieving nearly\n80% success. This performance is comparable to that of expert human\nteleoperation on the same tasks but operates in a more efficient way, at 1.5\ntimes the speed of human expert. The sim-to-real transfer is fluid across\ndiverse indoor and outdoor scenes under varying lighting conditions. 
Finally,\nwe discuss the key techniques that enable the entire pipeline, including\nefficient RL training and sim-to-real, to work effectively for legged mobile\nmanipulation, and present their ablation results.\n","authors":["Haichao Zhang","Haonan Yu","Le Zhao","Andrew Choi","Qinxun Bai","Break Yang","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2501.09905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14957v2","updated":"2025-01-22T22:15:13Z","published":"2024-10-19T03:08:10Z","title":"Offline-to-online Reinforcement Learning for Image-based Grasping with\n Scarce Demonstrations","summary":" Offline-to-online reinforcement learning (O2O RL) aims to obtain a\ncontinually improving policy as it interacts with the environment, while\nensuring the initial policy behaviour is satisficing. This satisficing\nbehaviour is necessary for robotic manipulation where random exploration can be\ncostly due to catastrophic failures and time. O2O RL is especially compelling\nwhen we can only obtain a scarce amount of (potentially suboptimal)\ndemonstrations$\\unicode{x2014}$a scenario where behavioural cloning (BC) is\nknown to suffer from distribution shift. Previous works have outlined the\nchallenges in applying O2O RL algorithms under the image-based environments. In\nthis work, we propose a novel O2O RL algorithm that can learn in a real-life\nimage-based robotic vacuum grasping task with a small number of demonstrations\nwhere BC fails majority of the time. The proposed algorithm replaces the target\nnetwork in off-policy actor-critic algorithms with a regularization technique\ninspired by neural tangent kernel. We demonstrate that the proposed algorithm\ncan reach above 90\\% success rate in under two hours of interaction time, with\nonly 50 human demonstrations, while BC and existing commonly-used RL algorithms\nfail to achieve similar performance.\n","authors":["Bryan Chan","Anson Leung","James Bergstra"],"pdf_url":"https://arxiv.org/pdf/2410.14957v2.pdf","comment":"In CoRL Workshop on Mastering Robot Manipulation in a World of\n Abundant Data 2024"},{"id":"http://arxiv.org/abs/2306.09600v2","updated":"2025-01-22T22:01:33Z","published":"2023-06-16T02:59:20Z","title":"From Novice to Skilled: RL-based Shared Autonomy Communicating with\n Pilots in UAV Multi-Task Missions","summary":" Multi-task missions for unmanned aerial vehicles (UAVs) involving inspection\nand landing tasks are challenging for novice pilots due to the difficulties\nassociated with depth perception and the control interface. We propose a shared\nautonomy system, alongside supplementary information displays, to assist pilots\nto successfully complete multi-task missions without any pilot training. Our\napproach comprises of three modules: (1) a perception module that encodes\nvisual information onto a latent representation, (2) a policy module that\naugments pilot's actions, and (3) an information augmentation module that\nprovides additional information to the pilot. The policy module is trained in\nsimulation with simulated users and transferred to the real world without\nmodification in a user study (n=29), alongside alternative supplementary\ninformation schemes including learnt red/green light feedback cues and an\naugmented reality display. The pilot's intent is unknown to the policy module\nand is inferred from the pilot's input and UAV's states. The assistant\nincreased task success rate for the landing and inspection tasks from [16.67% &\n54.29%] respectively to [95.59% & 96.22%]. 
With the assistant, inexperienced\npilots achieved similar performance to experienced pilots. Red/green light\nfeedback cues reduced the required time by 19.53% and trajectory length by\n17.86% for the inspection task, where participants rated it as their preferred\ncondition due to the intuitive interface and providing reassurance. This work\ndemonstrates that simple user models can train shared autonomy systems in\nsimulation, and transfer to physical tasks to estimate user intent and provide\neffective assistance and information to the pilot.\n","authors":["Kal Backman","Dana Kulić","Hoam Chung"],"pdf_url":"https://arxiv.org/pdf/2306.09600v2.pdf","comment":"37 pages, 11 figures, 6 tables. Accepted to ACM Transactions on\n Human-Robot Interaction (THRI)"},{"id":"http://arxiv.org/abs/2501.13233v1","updated":"2025-01-22T21:34:39Z","published":"2025-01-22T21:34:39Z","title":"\"See You Later, Alligator\": Impacts of Robot Small Talk on Task,\n Rapport, and Interaction Dynamics in Human-Robot Collaboration","summary":" Small talk can foster rapport building in human-human teamwork; yet how\nnon-anthropomorphic robots, such as collaborative manipulators commonly used in\nindustry, may capitalize on these social communications remains unclear. This\nwork investigates how robot-initiated small talk influences task performance,\nrapport, and interaction dynamics in human-robot collaboration. We developed an\nautonomous robot system that assists a human in an assembly task while\ninitiating and engaging in small talk. A user study ($N = 58$) was conducted in\nwhich participants worked with either a functional robot, which engaged in only\ntask-oriented speech, or a social robot, which also initiated small talk. Our\nstudy found that participants in the social condition reported significantly\nhigher levels of rapport with the robot. Moreover, all participants in the\nsocial condition responded to the robot's small talk attempts; 59% initiated\nquestions to the robot, and 73% engaged in lingering conversations after\nrequesting the final task item. Although active working times were similar\nacross conditions, participants in the social condition recorded longer task\ndurations than those in the functional condition. We discuss the design and\nimplications of robot small talk in shaping human-robot collaboration.\n","authors":["Kaitlynn Taylor Pineda","Ethan Brown","Chien-Ming Huang"],"pdf_url":"https://arxiv.org/pdf/2501.13233v1.pdf","comment":"8 pages, 4 figures, preprint for HRI25, the 20th edition of the\n IEEE/ACM International Conference on Human-Robot Interaction"},{"id":"http://arxiv.org/abs/2501.13203v1","updated":"2025-01-22T20:20:51Z","published":"2025-01-22T20:20:51Z","title":"Safe and Efficient Robot Action Planning in the Presence of Unconcerned\n Humans","summary":" This paper proposes a robot action planning scheme that provides an efficient\nand probabilistically safe plan for a robot interacting with an unconcerned\nhuman -- someone who is either unaware of the robot's presence or unwilling to\nengage in ensuring safety. The proposed scheme is predictive, meaning that the\nrobot is required to predict human actions over a finite future horizon; such\npredictions are often inaccurate in real-world scenarios. One possible approach\nto reduce the uncertainties is to provide the robot with the capability of\nreasoning about the human's awareness of potential dangers. 
This paper\ndiscusses that by using a binary variable, so-called danger awareness\ncoefficient, it is possible to differentiate between concerned and unconcerned\nhumans, and provides a learning algorithm to determine this coefficient by\nobserving human actions. Moreover, this paper argues how humans rely on\npredictions of other agents' future actions (including those of robots in\nhuman-robot interaction) in their decision-making. It also shows that ignoring\nthis aspect in predicting human's future actions can significantly degrade the\nefficiency of the interaction, causing agents to deviate from their optimal\npaths. The proposed robot action planning scheme is verified and validated via\nextensive simulation and experimental studies on a LoCoBot WidowX-250.\n","authors":["Mohsen Amiri","Mehdi Hosseinzadeh"],"pdf_url":"https://arxiv.org/pdf/2501.13203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13201v1","updated":"2025-01-22T20:09:32Z","published":"2025-01-22T20:09:32Z","title":"Polyhedral Collision Detection via Vertex Enumeration","summary":" Collision detection is a critical functionality for robotics. The degree to\nwhich objects collide cannot be represented as a continuously differentiable\nfunction for any shapes other than spheres. This paper proposes a framework for\nhandling collision detection between polyhedral shapes. We frame the signed\ndistance between two polyhedral bodies as the optimal value of a convex\noptimization, and consider constraining the signed distance in a bilevel\noptimization problem. To avoid relying on specialized bilevel solvers, our\nmethod exploits the fact that the signed distance is the minimal point of a\nconvex region related to the two bodies. Our method enumerates the values\nobtained at all extreme points of this region and lists them as constraints in\nthe higher-level problem. We compare our formulation to existing methods in\nterms of reliability and speed when solved using the same mixed complementarity\nproblem solver. We demonstrate that our approach more reliably solves difficult\ncollision detection problems with multiple obstacles than other methods, and is\nfaster than existing methods in some cases.\n","authors":["Andrew Cinar","Yue Zhao","Forrest Laine"],"pdf_url":"https://arxiv.org/pdf/2501.13201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13189v1","updated":"2025-01-22T19:40:04Z","published":"2025-01-22T19:40:04Z","title":"Map Prediction and Generative Entropy for Multi-Agent Exploration","summary":" Traditionally, autonomous reconnaissance applications have acted on explicit\nsets of historical observations. Aided by recent breakthroughs in generative\ntechnologies, this work enables robot teams to act beyond what is currently\nknown about the environment by inferring a distribution of reasonable\ninterpretations of the scene. We developed a map predictor that inpaints the\nunknown space in a multi-agent 2D occupancy map during an exploration mission.\nFrom a comparison of several inpainting methods, we found that a fine-tuned\nlatent diffusion inpainting model could provide rich and coherent\ninterpretations of simulated urban environments with relatively little\ncomputation time. By iteratively inferring interpretations of the scene\nthroughout an exploration run, we are able to identify areas that exhibit high\nuncertainty in the prediction, which we formalize with the concept of\ngenerative entropy. 
We prioritize tasks in regions of high generative entropy,\nhypothesizing that this will expedite convergence on an accurate predicted map\nof the scene. In our study we juxtapose this new paradigm of task ranking with\nthe state of the art, which ranks regions to explore by those which maximize\nexpected information recovery. We compare both of these methods in a simulated\nurban environment with three vehicles. Our results demonstrate that by using\nour new task ranking method, we can predict a correct scene significantly\nfaster than with a traditional information-guided method.\n","authors":["Alexander Spinos","Bradley Woosley","Justin Rokisky","Christopher Korpela","John G. Rogers III","Brian A. Bittner"],"pdf_url":"https://arxiv.org/pdf/2501.13189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04577v2","updated":"2025-01-22T19:28:38Z","published":"2025-01-08T15:47:04Z","title":"A 65 nm Bayesian Neural Network Accelerator with 360 fJ/Sample In-Word\n GRNG for AI Uncertainty Estimation","summary":" Uncertainty estimation is an indispensable capability for AI-enabled,\nsafety-critical applications, e.g. autonomous vehicles or medical diagnosis.\nBayesian neural networks (BNNs) use Bayesian statistics to provide both\nclassification predictions and uncertainty estimation, but they suffer from\nhigh computational overhead associated with random number generation and\nrepeated sample iterations. Furthermore, BNNs are not immediately amenable to\nacceleration through compute-in-memory architectures due to the frequent memory\nwrites necessary after each RNG operation. To address these challenges, we\npresent an ASIC that integrates 360 fJ/Sample Gaussian RNG directly into the\nSRAM memory words. This integration reduces RNG overhead and enables\nfully-parallel compute-in-memory operations for BNNs. The prototype chip\nachieves 5.12 GSa/s RNG throughput and 102 GOp/s neural network throughput\nwhile occupying 0.45 mm2, bringing AI uncertainty estimation to edge\ncomputation.\n","authors":["Zephan M. Enciso","Boyang Cheng","Likai Pei","Jianbo Liu","Steven Davis","Michael Niemier","Ningyuan Cao"],"pdf_url":"https://arxiv.org/pdf/2501.04577v2.pdf","comment":"7 pages, 12 figures"},{"id":"http://arxiv.org/abs/2501.13132v1","updated":"2025-01-22T02:41:36Z","published":"2025-01-22T02:41:36Z","title":"A Hierarchical Reinforcement Learning Framework for Multi-UAV Combat\n Using Leader-Follower Strategy","summary":" Multi-UAV air combat is a complex task involving multiple autonomous UAVs, an\nevolving field in both aerospace and artificial intelligence. This paper aims\nto enhance adversarial performance through collaborative strategies. Previous\napproaches predominantly discretize the action space into predefined actions,\nlimiting UAV maneuverability and complex strategy implementation. Others\nsimplify the problem to 1v1 combat, neglecting the cooperative dynamics among\nmultiple UAVs. To address the high-dimensional challenges inherent in\nsix-degree-of-freedom space and improve cooperation, we propose a hierarchical\nframework utilizing the Leader-Follower Multi-Agent Proximal Policy\nOptimization (LFMAPPO) strategy. Specifically, the framework is structured into\nthree levels. The top level conducts a macro-level assessment of the\nenvironment and guides execution policy. The middle level determines the angle\nof the desired action. The bottom level generates precise action commands for\nthe high-dimensional action space. 
Moreover, we optimize the state-value\nfunctions by assigning distinct roles with the leader-follower strategy to\ntrain the top-level policy, followers estimate the leader's utility, promoting\neffective cooperation among agents. Additionally, the incorporation of a target\nselector, aligned with the UAVs' posture, assesses the threat level of targets.\nFinally, simulation experiments validate the effectiveness of our proposed\nmethod.\n","authors":["Jinhui Pang","Jinglin He","Noureldin Mohamed Abdelaal Ahmed Mohamed","Changqing Lin","Zhihui Zhang","Xiaoshuai Hao"],"pdf_url":"https://arxiv.org/pdf/2501.13132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13973v1","updated":"2025-01-22T19:32:07Z","published":"2025-01-22T19:32:07Z","title":"A Spatio-temporal Graph Network Allowing Incomplete Trajectory Input for\n Pedestrian Trajectory Prediction","summary":" Pedestrian trajectory prediction is important in the research of mobile robot\nnavigation in environments with pedestrians. Most pedestrian trajectory\nprediction algorithms require the input historical trajectories to be complete.\nIf a pedestrian is unobservable in any frame in the past, then its historical\ntrajectory become incomplete, the algorithm will not predict its future\ntrajectory. To address this limitation, we propose the STGN-IT, a\nspatio-temporal graph network allowing incomplete trajectory input, which can\npredict the future trajectories of pedestrians with incomplete historical\ntrajectories. STGN-IT uses the spatio-temporal graph with an additional\nencoding method to represent the historical trajectories and observation states\nof pedestrians. Moreover, STGN-IT introduces static obstacles in the\nenvironment that may affect the future trajectories as nodes to further improve\nthe prediction accuracy. A clustering algorithm is also applied in the\nconstruction of spatio-temporal graphs. Experiments on public datasets show\nthat STGN-IT outperforms state of the art algorithms on these metrics.\n","authors":["Juncen Long","Gianluca Bardaro","Simone Mentasti","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2501.13973v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.13107v1","updated":"2025-01-22T18:59:58Z","published":"2025-01-22T18:59:58Z","title":"Accelerate High-Quality Diffusion Models with Inner Loop Feedback","summary":" We propose Inner Loop Feedback (ILF), a novel approach to accelerate\ndiffusion models' inference. ILF trains a lightweight module to predict future\nfeatures in the denoising process by leveraging the outputs from a chosen\ndiffusion backbone block at a given time step. This approach exploits two key\nintuitions; (1) the outputs of a given block at adjacent time steps are\nsimilar, and (2) performing partial computations for a step imposes a lower\nburden on the model than skipping the step entirely. Our method is highly\nflexible, since we find that the feedback module itself can simply be a block\nfrom the diffusion backbone, with all settings copied. Its influence on the\ndiffusion forward can be tempered with a learnable scaling factor from zero\ninitialization. We train this module using distillation losses; however, unlike\nsome prior work where a full diffusion backbone serves as the student, our\nmodel freezes the backbone, training only the feedback module. 
While many\nefforts to optimize diffusion models focus on achieving acceptable image\nquality in extremely few steps (1-4 steps), our emphasis is on matching best\ncase results (typically achieved in 20 steps) while significantly reducing\nruntime. ILF achieves this balance effectively, demonstrating strong\nperformance for both class-to-image generation with diffusion transformer (DiT)\nand text-to-image generation with DiT-based PixArt-alpha and PixArt-sigma. The\nquality of ILF's 1.7x-1.8x speedups are confirmed by FID, CLIP score, CLIP\nImage Quality Assessment, ImageReward, and qualitative comparisons.\n","authors":["Matthew Gwilliam","Han Cai","Di Wu","Abhinav Shrivastava","Zhiyu Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.13107v1.pdf","comment":"submission currently under review; 20 pages, 17 figures, 6 tables"},{"id":"http://arxiv.org/abs/2501.13106v1","updated":"2025-01-22T18:59:46Z","published":"2025-01-22T18:59:46Z","title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video\n Understanding","summary":" In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation\nmodel for image and video understanding. The core design philosophy of\nVideoLLaMA3 is vision-centric. The meaning of \"vision-centric\" is two-fold: the\nvision-centric training paradigm and vision-centric framework design. The key\ninsight of our vision-centric training paradigm is that high-quality image-text\ndata is crucial for both image and video understanding. Instead of preparing\nmassive video-text datasets, we focus on constructing large-scale and\nhigh-quality image-text datasets. VideoLLaMA3 has four training stages: 1)\nvision-centric alignment stage, which warms up the vision encoder and\nprojector; 2) vision-language pretraining stage, which jointly tunes the vision\nencoder, projector, and LLM with large-scale image-text data covering multiple\ntypes (including scene images, documents, charts) as well as text-only data. 3)\nmulti-task fine-tuning stage, which incorporates image-text SFT data for\ndownstream tasks and video-text data to establish a foundation for video\nunderstanding. 4) video-centric fine-tuning, which further improves the model's\ncapability in video understanding. As for the framework design, to better\ncapture fine-grained details in images, the pretrained vision encoder is\nadapted to encode images of varying sizes into vision tokens with corresponding\nnumbers, rather than a fixed number of tokens. For video inputs, we reduce the\nnumber of vision tokens according to their similarity so that the\nrepresentation of videos will be more precise and compact. Benefit from\nvision-centric designs, VideoLLaMA3 achieves compelling performances in both\nimage and video understanding benchmarks.\n","authors":["Boqiang Zhang","Kehan Li","Zesen Cheng","Zhiqiang Hu","Yuqian Yuan","Guanzheng Chen","Sicong Leng","Yuming Jiang","Hang Zhang","Xin Li","Peng Jin","Wenqi Zhang","Fan Wang","Lidong Bing","Deli Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.13106v1.pdf","comment":"BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to\n this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3"},{"id":"http://arxiv.org/abs/2501.13104v1","updated":"2025-01-22T18:59:10Z","published":"2025-01-22T18:59:10Z","title":"Neural Radiance Fields for the Real World: A Survey","summary":" Neural Radiance Fields (NeRFs) have remodeled 3D scene representation since\nrelease. 
NeRFs can effectively reconstruct complex 3D scenes from 2D images,\nadvancing different fields and applications such as scene understanding, 3D\ncontent generation, and robotics. Despite significant research progress, a\nthorough review of recent innovations, applications, and challenges is lacking.\nThis survey compiles key theoretical advancements and alternative\nrepresentations and investigates emerging challenges. It further explores\napplications on reconstruction, highlights NeRFs' impact on computer vision and\nrobotics, and reviews essential datasets and toolkits. By identifying gaps in\nthe literature, this survey discusses open challenges and offers directions for\nfuture research.\n","authors":["Wenhui Xiao","Remi Chierchia","Rodrigo Santa Cruz","Xuesong Li","David Ahmedt-Aristizabal","Olivier Salvado","Clinton Fookes","Leo Lebrat"],"pdf_url":"https://arxiv.org/pdf/2501.13104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13094v1","updated":"2025-01-22T18:52:06Z","published":"2025-01-22T18:52:06Z","title":"Robust Representation Consistency Model via Contrastive Denoising","summary":" Robustness is essential for deep neural networks, especially in\nsecurity-sensitive applications. To this end, randomized smoothing provides\ntheoretical guarantees for certifying robustness against adversarial\nperturbations. Recently, diffusion models have been successfully employed for\nrandomized smoothing to purify noise-perturbed samples before making\npredictions with a standard classifier. While these methods excel at small\nperturbation radii, they struggle with larger perturbations and incur a\nsignificant computational overhead during inference compared to classical\nmethods. To address this, we reformulate the generative modeling task along the\ndiffusion trajectories in pixel space as a discriminative task in the latent\nspace. Specifically, we use instance discrimination to achieve consistent\nrepresentations along the trajectories by aligning temporally adjacent points.\nAfter fine-tuning based on the learned representations, our model enables\nimplicit denoising-then-classification via a single prediction, substantially\nreducing inference costs. We conduct extensive experiments on various datasets\nand achieve state-of-the-art performance with minimal computation budget during\ninference. For example, our method outperforms the certified accuracy of\ndiffusion-based methods on ImageNet across all perturbation radii by 5.3% on\naverage, with up to 11.6% at larger radii, while reducing inference costs by\n85$\\times$ on average. Codes are available at:\nhttps://github.com/jiachenlei/rRCM.\n","authors":["Jiachen Lei","Julius Berner","Jiongxiao Wang","Zhongzhu Chen","Zhongjia Ba","Kui Ren","Jun Zhu","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2501.13094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13087v1","updated":"2025-01-22T18:46:47Z","published":"2025-01-22T18:46:47Z","title":"Orchid: Image Latent Diffusion for Joint Appearance and Geometry\n Generation","summary":" Diffusion models are state-of-the-art for image generation. Trained on large\ndatasets, they capture expressive image priors that have been used for tasks\nlike inpainting, depth, and (surface) normal prediction. However, these models\nare typically trained for one specific task, e.g., a separate model for each of\ncolor, depth, and normal prediction. 
Such models do not leverage the intrinsic\ncorrelation between appearance and geometry, often leading to inconsistent\npredictions.\n In this paper, we propose using a novel image diffusion prior that jointly\nencodes appearance and geometry. We introduce a diffusion model Orchid,\ncomprising a Variational Autoencoder (VAE) to encode color, depth, and surface\nnormals to a latent space, and a Latent Diffusion Model (LDM) for generating\nthese joint latents. Orchid directly generates photo-realistic color images,\nrelative depth, and surface normals from user-provided text, and can be used to\ncreate image-aligned partial 3D scenes seamlessly. It can also perform\nimage-conditioned tasks like joint monocular depth and normal prediction and is\ncompetitive in accuracy to state-of-the-art methods designed for those tasks\nalone. Lastly, our model learns a joint prior that can be used zero-shot as a\nregularizer for many inverse problems that entangle appearance and geometry.\nFor example, we demonstrate its effectiveness in color-depth-normal inpainting,\nshowcasing its applicability to problems in 3D generation from sparse views.\n","authors":["Akshay Krishnan","Xinchen Yan","Vincent Casser","Abhijit Kundu"],"pdf_url":"https://arxiv.org/pdf/2501.13087v1.pdf","comment":"Project webpage: https://orchid3d.github.io"},{"id":"http://arxiv.org/abs/2501.13073v1","updated":"2025-01-22T18:35:57Z","published":"2025-01-22T18:35:57Z","title":"CHaRNet: Conditioned Heatmap Regression for Robust Dental Landmark\n Localization","summary":" Identifying anatomical landmarks in 3D dental models is crucial for\northodontic treatment. Manually placing these key points is complex,\ntime-consuming, and requires expert knowledge. While some machine learning\nmethods have been proposed for automatic tooth landmark detection in 3D\nIntraoral Scans (IOS), research remains limited, with no fully end-to-end\napproaches that avoid teeth segmentation.\n We propose CHaRNet (Conditioned Heatmap Regression Network), the first\nend-to-end deep learning method for tooth landmark detection in 3D IOS. Unlike\ntraditional two-stage methods that segment teeth before detecting landmarks,\nCHaRNet directly detects landmarks on the input point cloud. It consists of\nfour key modules: (1) a point cloud encoder, (2) a point cloud decoder with a\nheatmap regression head, (3) a teeth presence classification head, and (4) the\ninnovative Conditioned Heatmap Regression (CHaR) module. The CHaR module\nrefines landmark regression by leveraging teeth presence classification,\nenabling dynamic adaptation to cases with missing teeth and improving accuracy\nin complex dental models.\n We evaluate CHaRNet using five point cloud learning algorithms to validate\nthe effectiveness of the CHaR module and test it on a clinical dataset of\n$1,214$ annotated 3D dental models. Both the dataset and code will be publicly\nreleased to address the lack of open datasets in orthodontics, promote\nbenchmarking, and inspire new research.\n CHaRNet achieves a Mean Euclidean Distance Error (MEDE) of 1.28 mm and a Mean\nSuccess Ratio (MSR) of 82.40\\%, demonstrating robust performance. Notably, it\nexcels in handling irregular dental geometries, such as models with missing\nteeth. 
This end-to-end approach streamlines orthodontic workflows, improves 3D\nIOS analysis precision, and facilitates efficient computer-assisted treatment\nplanning.\n","authors":["José Rodríguez-Ortega","Siham Tabik"],"pdf_url":"https://arxiv.org/pdf/2501.13073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13071v1","updated":"2025-01-22T18:32:23Z","published":"2025-01-22T18:32:23Z","title":"Robust Body Composition Analysis by Generating 3D CT Volumes from\n Limited 2D Slices","summary":" Body composition analysis provides valuable insights into aging, disease\nprogression, and overall health conditions. Due to concerns of radiation\nexposure, two-dimensional (2D) single-slice computed tomography (CT) imaging\nhas been used repeatedly for body composition analysis. However, this approach\nintroduces significant spatial variability that can impact the accuracy and\nrobustness of the analysis. To mitigate this issue and facilitate body\ncomposition analysis, this paper presents a novel method to generate 3D CT\nvolumes from limited number of 2D slices using a latent diffusion model (LDM).\nOur approach first maps 2D slices into a latent representation space using a\nvariational autoencoder. An LDM is then trained to capture the 3D context of a\nstack of these latent representations. To accurately interpolate\nintermediateslices and construct a full 3D volume, we utilize body part\nregression to determine the spatial location and distance between the acquired\nslices. Experiments on both in-house and public 3D abdominal CT datasets\ndemonstrate that the proposed method significantly enhances body composition\nanalysis compared to traditional 2D-based analysis, with a reduced error rate\nfrom 23.3% to 15.2%.\n","authors":["Lianrui Zuo","Xin Yu","Dingjie Su","Kaiwen Xu","Aravind R. Krishnan","Yihao Liu","Shunxing Bao","Fabien Maldonado","Luigi Ferrucci","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2501.13071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13068v1","updated":"2025-01-22T18:28:18Z","published":"2025-01-22T18:28:18Z","title":"Beyond the Lungs: Extending the Field of View in Chest CT with Latent\n Diffusion Models","summary":" The interconnection between the human lungs and other organs, such as the\nliver and kidneys, is crucial for understanding the underlying risks and\neffects of lung diseases and improving patient care. However, most research\nchest CT imaging is focused solely on the lungs due to considerations of cost\nand radiation dose. This restricted field of view (FOV) in the acquired images\nposes challenges to comprehensive analysis and hinders the ability to gain\ninsights into the impact of lung diseases on other organs. To address this, we\npropose SCOPE (Spatial Coverage Optimization with Prior Encoding), a novel\napproach to capture the inter-organ relationships from CT images and extend the\nFOV of chest CT images. Our approach first trains a variational autoencoder\n(VAE) to encode 2D axial CT slices individually, then stacks the latent\nrepresentations of the VAE to form a 3D context for training a latent diffusion\nmodel. Once trained, our approach extends the FOV of CT images in the\nz-direction by generating new axial slices in a zero-shot manner. 
We evaluated\nour approach on the National Lung Screening Trial (NLST) dataset, and results\nsuggest that it effectively extends the FOV to include the liver and kidneys,\nwhich are not completely covered in the original NLST data acquisition.\nQuantitative results on a held-out whole-body dataset demonstrate that the\ngenerated slices exhibit high fidelity with acquired data, achieving an SSIM of\n0.81.\n","authors":["Lianrui Zuo","Kaiwen Xu","Dingjie Su","Xin Yu","Aravind R. Krishnan","Yihao Liu","Shunxing Bao","Thomas Li","Kim L. Sandler","Fabien Maldonado","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2501.13068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02441v3","updated":"2025-01-22T18:23:26Z","published":"2024-11-02T13:03:44Z","title":"Cross-D Conv: Cross-Dimensional Transferable Knowledge Base via Fourier\n Shifting Operation","summary":" In biomedical imaging analysis, the dichotomy between 2D and 3D data presents\na significant challenge. While 3D volumes offer superior real-world\napplicability, they are less available for each modality and not easy to train\nin large scale, whereas 2D samples are abundant but less comprehensive. This\npaper introduces \\texttt{Cross-D Conv} operation, a novel approach that bridges\nthe dimensional gap by learning the phase shifting in the Fourier domain. Our\nmethod enables seamless weight transfer between 2D and 3D convolution\noperations, effectively facilitating cross-dimensional learning. The proposed\narchitecture leverages the abundance of 2D training data to enhance 3D model\nperformance, offering a practical solution to the multimodal data scarcity\nchallenge in 3D medical model pretraining. Experimental validation on the\nRadImagenet (2D) and multimodal volumetric sets demonstrates that our approach\nachieves comparable or superior performance in feature quality assessment. The\nenhanced convolution operation presents new opportunities for developing\nefficient classification and segmentation models in medical imaging. This work\nrepresents an advancement in cross-dimensional and multimodal medical image\nanalysis, offering a robust framework for utilizing 2D priors in 3D model\npretraining while maintaining computational efficiency of 2D training.\n","authors":["Mehmet Can Yavuz","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.02441v3.pdf","comment":"Accepted for ISBI25; Codes&Weights:\n https://github.com/convergedmachine/Cross-D-Conv"},{"id":"http://arxiv.org/abs/2501.13066v1","updated":"2025-01-22T18:21:55Z","published":"2025-01-22T18:21:55Z","title":"SMART-Vision: Survey of Modern Action Recognition Techniques in Vision","summary":" Human Action Recognition (HAR) is a challenging domain in computer vision,\ninvolving recognizing complex patterns by analyzing the spatiotemporal dynamics\nof individuals' movements in videos. These patterns arise in sequential data,\nsuch as video frames, which are often essential to accurately distinguish\nactions that would be ambiguous in a single image. HAR has garnered\nconsiderable interest due to its broad applicability, ranging from robotics and\nsurveillance systems to sports motion analysis, healthcare, and the burgeoning\nfield of autonomous vehicles. While several taxonomies have been proposed to\ncategorize HAR approaches in surveys, they often overlook hybrid methodologies\nand fail to demonstrate how different models incorporate various architectures\nand modalities. 
In this comprehensive survey, we present the novel SMART-Vision\ntaxonomy, which illustrates how innovations in deep learning for HAR complement\none another, leading to hybrid approaches beyond traditional categories. Our\nsurvey provides a clear roadmap from foundational HAR works to current\nstate-of-the-art systems, highlighting emerging research directions and\naddressing unresolved challenges in discussion sections for architectures\nwithin the HAR domain. We provide details of the research datasets that various\napproaches used to measure and compare the goodness of HAR approaches. We also\nexplore the rapidly emerging field of Open-HAR systems, which challenges HAR\nsystems by presenting samples from unknown, novel classes during test time.\n","authors":["Ali K. AlShami","Ryan Rabinowitz","Khang Lam","Yousra Shleibik","Melkamu Mersha","Terrance Boult","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2501.13066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03153v2","updated":"2025-01-22T18:21:13Z","published":"2024-06-09T17:42:09Z","title":"An Efficient Framework for Crediting Data Contributors of Diffusion\n Models","summary":" As diffusion models are deployed in real-world settings, and their\nperformance is driven by training data, appraising the contribution of data\ncontributors is crucial to creating incentives for sharing quality data and to\nimplementing policies for data compensation. Depending on the use case, model\nperformance corresponds to various global properties of the distribution\nlearned by a diffusion model (e.g., overall aesthetic quality). Hence, here we\naddress the problem of attributing global properties of diffusion models to\ndata contributors. The Shapley value provides a principled approach to\nvaluation by uniquely satisfying game-theoretic axioms of fairness. However,\nestimating Shapley values for diffusion models is computationally impractical\nbecause it requires retraining on many training data subsets corresponding to\ndifferent contributors and rerunning inference. We introduce a method to\nefficiently retrain and rerun inference for Shapley value estimation, by\nleveraging model pruning and fine-tuning. We evaluate the utility of our method\nwith three use cases: (i) image quality for a DDPM trained on a CIFAR dataset,\n(ii) demographic diversity for an LDM trained on CelebA-HQ, and (iii) aesthetic\nquality for a Stable Diffusion model LoRA-finetuned on Post-Impressionist\nartworks. Our results empirically demonstrate that our framework can identify\nimportant data contributors across models' global properties, outperforming\nexisting attribution methods for diffusion models.\n","authors":["Chris Lin","Mingyu Lu","Chanwoo Kim","Su-In Lee"],"pdf_url":"https://arxiv.org/pdf/2407.03153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13058v1","updated":"2025-01-22T18:10:20Z","published":"2025-01-22T18:10:20Z","title":"A polynomial formula for the perspective four points problem","summary":" We present a fast and accurate solution to the perspective n-points problem,\nby way of a new approach to the n=4 case. Our solution hinges on a novel\nseparation of variables: given four 3D points and four corresponding 2D points\non the camera canvas, we start by finding another set of 3D points, sitting on\nthe rays connecting the camera to the 2D canvas points, so that the six\npair-wise distances between these 3D points are as close as possible to the six\ndistances between the original 3D points. 
This step reduces the perspective\nproblem to an absolute orientation problem (which has a solution via explicit\nformula). To solve the first problem we set coordinates which are as\norientation-free as possible: on the 3D points side our coordinates are the\nsquared distances between the points. On the 2D canvas-points side our\ncoordinates are the dot products of the points after rotating one of them to\nsit on the optical axis. We then derive the solution with the help of a\ncomputer algebra system.\n","authors":["David Lehavi","Brian Osserman"],"pdf_url":"https://arxiv.org/pdf/2501.13058v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2501.13054v1","updated":"2025-01-22T18:06:00Z","published":"2025-01-22T18:06:00Z","title":"STMDNet: A Lightweight Directional Framework for Motion Pattern\n Recognition of Tiny Targets","summary":" Recognizing motions of tiny targets - only few dozen pixels - in cluttered\nbackgrounds remains a fundamental challenge when standard feature-based or deep\nlearning methods fail under scarce visual cues. We propose STMDNet, a\nmodel-based computational framework to Recognize motions of tiny targets at\nvariable velocities under low-sampling frequency scenarios. STMDNet designs a\nnovel dual-dynamics-and-correlation mechanism, harnessing ipsilateral\nexcitation to integrate target cues and leakage-enhancing-type contralateral\ninhibition to suppress large-object and background motion interference.\nMoreover, we develop the first collaborative directional encoding-decoding\nstrategy that determines the motion direction from only one correlation per\nspatial location, cutting computational costs to one-eighth of prior methods.\nFurther, simply substituting the backbone of a strong STMD model with STMDNet\nraises AUC by 24%, yielding an enhanced STMDNet-F. Evaluations on real-world\nlow sampling frequency datasets show state-of-the-art results, surpassing the\ndeep learning baseline. Across diverse speeds, STMDNet-F improves mF1 by 19%,\n16%, and 8% at 240Hz, 120Hz, and 60Hz, respectively, while STMDNet achieves 87\nFPS on a single CPU thread. These advances highlight STMDNet as a\nnext-generation backbone for tiny target motion pattern recognition and\nunderscore its broader potential to revitalize model-based visual approaches in\nmotion detection.\n","authors":["Mingshuo Xu","Hao Luan","Zhou Daniel Hao","Jigen Peng","Shigang Yue"],"pdf_url":"https://arxiv.org/pdf/2501.13054v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.13045v1","updated":"2025-01-22T17:52:45Z","published":"2025-01-22T17:52:45Z","title":"Sketch and Patch: Efficient 3D Gaussian Representation for Man-Made\n Scenes","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising representation for\nphotorealistic rendering of 3D scenes. However, its high storage requirements\npose significant challenges for practical applications. We observe that\nGaussians exhibit distinct roles and characteristics that are analogous to\ntraditional artistic techniques -- Like how artists first sketch outlines\nbefore filling in broader areas with color, some Gaussians capture\nhigh-frequency features like edges and contours; While other Gaussians\nrepresent broader, smoother regions, that are analogous to broader brush\nstrokes that add volume and depth to a painting. 
Based on this observation, we\npropose a novel hybrid representation that categorizes Gaussians into (i)\nSketch Gaussians, which define scene boundaries, and (ii) Patch Gaussians,\nwhich cover smooth regions. Sketch Gaussians are efficiently encoded using\nparametric models, leveraging their geometric coherence, while Patch Gaussians\nundergo optimized pruning, retraining, and vector quantization to maintain\nvolumetric consistency and storage efficiency. Our comprehensive evaluation\nacross diverse indoor and outdoor scenes demonstrates that this structure-aware\napproach achieves up to 32.62% improvement in PSNR, 19.12% in SSIM, and 45.41%\nin LPIPS at equivalent model sizes, and correspondingly, for an indoor scene,\nour model maintains the visual quality with 2.3% of the original model size.\n","authors":["Yuang Shi","Simone Gasparini","Géraldine Morin","Chenggang Yang","Wei Tsang Ooi"],"pdf_url":"https://arxiv.org/pdf/2501.13045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16772v2","updated":"2025-01-22T17:42:29Z","published":"2024-07-23T18:10:43Z","title":"VisMin: Visual Minimal-Change Understanding","summary":" Fine-grained understanding of objects, attributes, and relationships between\nobjects is crucial for visual-language models (VLMs). Existing benchmarks\nprimarily focus on evaluating VLMs' capability to distinguish between two very\nsimilar captions given an image. In this paper, we introduce a new, challenging\nbenchmark termed Visual Minimal-Change Understanding (VisMin), which requires\nmodels to predict the correct image-caption match given two images and two\ncaptions. The image pair and caption pair contain minimal changes, i.e., only\none aspect changes at a time from among the following: object, attribute,\ncount, and spatial relation. These changes test the models' understanding of\nobjects, attributes (such as color, material, shape), counts, and spatial\nrelationships between objects. We built an automatic framework using large\nlanguage models and diffusion models, followed by a rigorous 4-step\nverification process by human annotators. Empirical experiments reveal that\ncurrent VLMs exhibit notable deficiencies in understanding spatial\nrelationships and counting abilities. We also generate a large-scale training\ndataset to finetune CLIP and Idefics2, showing significant improvements in\nfine-grained understanding across benchmarks and in CLIP's general image-text\nalignment. We release all resources, including the benchmark, training data,\nand finetuned model checkpoints, at https://vismin.net/.\n","authors":["Rabiul Awal","Saba Ahmadi","Le Zhang","Aishwarya Agrawal"],"pdf_url":"https://arxiv.org/pdf/2407.16772v2.pdf","comment":"Accepted at NeurIPS 2024. Project URL at https://vismin.net/"},{"id":"http://arxiv.org/abs/2501.12060v2","updated":"2025-01-22T17:24:38Z","published":"2025-01-21T11:30:51Z","title":"GSVC: Efficient Video Representation and Compression Through 2D Gaussian\n Splatting","summary":" 3D Gaussian splats have emerged as a revolutionary, effective, learned\nrepresentation for static 3D scenes. In this work, we explore using 2D Gaussian\nsplats as a new primitive for representing videos. We propose GSVC, an approach\nto learning a set of 2D Gaussian splats that can effectively represent and\ncompress video frames. 
GSVC incorporates the following techniques: (i) To\nexploit temporal redundancy among adjacent frames, which can speed up training\nand improve the compression efficiency, we predict the Gaussian splats of a\nframe based on its previous frame; (ii) To control the trade-offs between file\nsize and quality, we remove Gaussian splats with low contribution to the video\nquality; (iii) To capture dynamics in videos, we randomly add Gaussian splats\nto fit content with large motion or newly-appeared objects; (iv) To handle\nsignificant changes in the scene, we detect key frames based on loss\ndifferences during the learning process. Experiment results show that GSVC\nachieves good rate-distortion trade-offs, comparable to state-of-the-art video\ncodecs such as AV1 and VVC, and a rendering speed of 1500 fps for a 1920x1080\nvideo.\n","authors":["Longan Wang","Yuang Shi","Wei Tsang Ooi"],"pdf_url":"https://arxiv.org/pdf/2501.12060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16086v2","updated":"2025-01-22T17:18:15Z","published":"2024-12-20T17:33:50Z","title":"Towards Interpretable Radiology Report Generation via Concept\n Bottlenecks using a Multi-Agentic RAG","summary":" Deep learning has advanced medical image classification, but interpretability\nchallenges hinder its clinical adoption. This study enhances interpretability\nin Chest X-ray (CXR) classification by using concept bottleneck models (CBMs)\nand a multi-agent Retrieval-Augmented Generation (RAG) system for report\ngeneration. By modeling relationships between visual features and clinical\nconcepts, we create interpretable concept vectors that guide a multi-agent RAG\nsystem to generate radiology reports, enhancing clinical relevance,\nexplainability, and transparency. Evaluation of the generated reports using an\nLLM-as-a-judge confirmed the interpretability and clinical utility of our\nmodel's outputs. On the COVID-QU dataset, our model achieved 81% classification\naccuracy and demonstrated robust report generation performance, with five key\nmetrics ranging between 84% and 90%. This interpretable multi-agent framework\nbridges the gap between high-performance AI and the explainability required for\nreliable AI-driven CXR analysis in clinical settings. Our code is available at\nhttps://github.com/tifat58/IRR-with-CBM-RAG.git.\n","authors":["Hasan Md Tusfiqur Alam","Devansh Srivastav","Md Abdul Kadir","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2412.16086v2.pdf","comment":"Accepted in the 47th European Conference for Information Retrieval\n (ECIR) 2025"},{"id":"http://arxiv.org/abs/2501.13010v1","updated":"2025-01-22T16:52:20Z","published":"2025-01-22T16:52:20Z","title":"Learning accurate rigid registration for longitudinal brain MRI from\n synthetic data","summary":" Rigid registration aims to determine the translations and rotations necessary\nto align features in a pair of images. While recent machine learning methods\nhave become state-of-the-art for linear and deformable registration across\nsubjects, they have demonstrated limitations when applied to longitudinal\n(within-subject) registration, where achieving precise alignment is critical.\nBuilding on an existing framework for anatomy-aware, acquisition-agnostic\naffine registration, we propose a model optimized for longitudinal, rigid brain\nregistration. 
By training the model with synthetic within-subject pairs\naugmented with rigid and subtle nonlinear transforms, the model estimates more\naccurate rigid transforms than previous cross-subject networks and performs\nrobustly on longitudinal registration pairs within and across magnetic\nresonance imaging (MRI) contrasts.\n","authors":["Jingru Fu","Adrian V. Dalca","Bruce Fischl","Rodrigo Moreno","Malte Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2501.13010v1.pdf","comment":"5 pages, 4 figures, 1 table, rigid image registration, deep learning,\n longitudinal analysis, neuroimaging, accepted by the IEEE International\n Symposium on Biomedical Imaging"},{"id":"http://arxiv.org/abs/2501.13009v1","updated":"2025-01-22T16:50:58Z","published":"2025-01-22T16:50:58Z","title":"Deep Learning-Based Image Recovery and Pose Estimation for Resident\n Space Objects","summary":" As the density of spacecraft in Earth's orbit increases, their recognition,\npose and trajectory identification becomes crucial for averting potential\ncollisions and executing debris removal operations. However, training models\nable to identify a spacecraft and its pose presents a significant challenge due\nto a lack of available image data for model training. This paper puts forth an\ninnovative framework for generating realistic synthetic datasets of Resident\nSpace Object (RSO) imagery. Using the International Space Station (ISS) as a\ntest case, it goes on to combine image regression with image restoration\nmethodologies to estimate pose from blurred images. An analysis of the proposed\nimage recovery and regression techniques was undertaken, providing insights\ninto the performance, potential enhancements and limitations when applied to\nreal imagery of RSOs. The image recovery approach investigated involves first\napplying image deconvolution using an effective point spread function, followed\nby detail object extraction with a U-Net. Interestingly, using only U-Net for\nimage reconstruction the best pose performance was attained, reducing the\naverage Mean Squared Error in image recovery by 97.28% and the average angular\nerror by 71.9%. The successful application of U-Net image restoration combined\nwith the Resnet50 regression network for pose estimation of the International\nSpace Station demonstrates the value of a diverse set of evaluation tools for\neffective solutions to real-world problems such as the analysis of distant\nobjects in Earth's orbit.\n","authors":["Louis Aberdeen","Mark Hansen","Melvyn L. Smith","Lyndon Smith"],"pdf_url":"https://arxiv.org/pdf/2501.13009v1.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2501.12981v1","updated":"2025-01-22T16:10:42Z","published":"2025-01-22T16:10:42Z","title":"UniUIR: Considering Underwater Image Restoration as An All-in-One\n Learner","summary":" Existing underwater image restoration (UIR) methods generally only handle\ncolor distortion or jointly address color and haze issues, but they often\noverlook the more complex degradations that can occur in underwater scenes. To\naddress this limitation, we propose a Universal Underwater Image Restoration\nmethod, termed as UniUIR, considering the complex scenario of real-world\nunderwater mixed distortions as an all-in-one manner. To decouple\ndegradation-specific issues and explore the inter-correlations among various\ndegradations in UIR task, we designed the Mamba Mixture-of-Experts module. 
This\nmodule enables each expert to identify distinct types of degradation and\ncollaboratively extract task-specific priors while maintaining global feature\nrepresentation based on linear complexity. Building upon this foundation, to\nenhance degradation representation and address the task conflicts that arise\nwhen handling multiple types of degradation, we introduce the spatial-frequency\nprior generator. This module extracts degradation prior information in both\nspatial and frequency domains, and adaptively selects the most appropriate\ntask-specific prompts based on image content, thereby improving the accuracy of\nimage restoration. Finally, to more effectively address complex,\nregion-dependent distortions in UIR task, we incorporate depth information\nderived from a large-scale pre-trained depth prediction model, thereby enabling\nthe network to perceive and leverage depth variations across different image\nregions to handle localized degradation. Extensive experiments demonstrate that\nUniUIR can produce more attractive results across qualitative and quantitative\ncomparisons, and shows strong generalization than state-of-the-art methods.\n","authors":["Xu Zhang","Huan Zhang","Guoli Wang","Qian Zhang","Lefei Zhang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2501.12981v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.12976v1","updated":"2025-01-22T16:02:06Z","published":"2025-01-22T16:02:06Z","title":"LiT: Delving into a Simplified Linear Diffusion Transformer for Image\n Generation","summary":" In commonly used sub-quadratic complexity modules, linear attention benefits\nfrom simplicity and high parallelism, making it promising for image synthesis\ntasks. However, the architectural design and learning strategy for linear\nattention remain underexplored in this field. In this paper, we offer a suite\nof ready-to-use solutions for efficient linear diffusion Transformers. Our core\ncontributions include: (1) Simplified Linear Attention using few heads,\nobserving the free-lunch effect of performance without latency increase. (2)\nWeight inheritance from a fully pre-trained diffusion Transformer: initializing\nlinear Transformer using pre-trained diffusion Transformer and loading all\nparameters except for those related to linear attention. (3) Hybrid knowledge\ndistillation objective: using a pre-trained diffusion Transformer to help the\ntraining of the student linear Transformer, supervising not only the predicted\nnoise but also the variance of the reverse diffusion process. These guidelines\nlead to our proposed Linear Diffusion Transformer (LiT), an efficient\ntext-to-image Transformer that can be deployed offline on a laptop. Experiments\nshow that in class-conditional 256*256 and 512*512 ImageNet benchmark LiT\nachieves highly competitive FID while reducing training steps by 80% and 77%\ncompared to DiT. LiT also rivals methods based on Mamba or Gated Linear\nAttention. Besides, for text-to-image generation, LiT allows for the rapid\nsynthesis of up to 1K resolution photorealistic images. 
Project page:\nhttps://techmonsterwang.github.io/LiT/.\n","authors":["Jiahao Wang","Ning Kang","Lewei Yao","Mengzhao Chen","Chengyue Wu","Songyang Zhang","Shuchen Xue","Yong Liu","Taiqiang Wu","Xihui Liu","Kaipeng Zhang","Shifeng Zhang","Wenqi Shao","Zhenguo Li","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2501.12976v1.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2305.17349v4","updated":"2025-01-22T16:01:04Z","published":"2023-05-27T03:05:07Z","title":"Condition-Invariant Semantic Segmentation","summary":" Adaptation of semantic segmentation networks to different visual conditions\nis vital for robust perception in autonomous cars and robots. However, previous\nwork has shown that most feature-level adaptation methods, which employ\nadversarial training and are validated on synthetic-to-real adaptation, provide\nmarginal gains in condition-level adaptation, being outperformed by simple\npixel-level adaptation via stylization. Motivated by these findings, we propose\nto leverage stylization in performing feature-level adaptation by aligning the\ninternal network features extracted by the encoder of the network from the\noriginal and the stylized view of each input image with a novel feature\ninvariance loss. In this way, we encourage the encoder to extract features that\nare already invariant to the style of the input, allowing the decoder to focus\non parsing these features and not on further abstracting from the specific\nstyle of the input. We implement our method, named Condition-Invariant Semantic\nSegmentation (CISS), on the current state-of-the-art domain adaptation\narchitecture and achieve outstanding results on condition-level adaptation. In\nparticular, CISS sets the new state of the art in the popular\ndaytime-to-nighttime Cityscapes$\\to$Dark Zurich benchmark. Furthermore, our\nmethod achieves the second-best performance on the normal-to-adverse\nCityscapes$\\to$ACDC benchmark. CISS is shown to generalize well to domains\nunseen during training, such as BDD100K-night and ACDC-night. Code is publicly\navailable at https://github.com/SysCV/CISS .\n","authors":["Christos Sakaridis","David Bruggemann","Fisher Yu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2305.17349v4.pdf","comment":"IEEE T-PAMI 2025"},{"id":"http://arxiv.org/abs/2501.12974v1","updated":"2025-01-22T15:58:11Z","published":"2025-01-22T15:58:11Z","title":"MorphoSkel3D: Morphological Skeletonization of 3D Point Clouds for\n Informed Sampling in Object Classification and Retrieval","summary":" Point clouds are a set of data points in space to represent the 3D geometry\nof objects. A fundamental step in the processing is to identify a subset of\npoints to represent the shape. While traditional sampling methods often ignore\nto incorporate geometrical information, recent developments in learning-based\nsampling models have achieved significant levels of performance. With the\nintegration of geometrical priors, the ability to learn and preserve the\nunderlying structure can be enhanced when sampling. To shed light into the\nshape, a qualitative skeleton serves as an effective descriptor to guide\nsampling for both local and global geometries. In this paper, we introduce\nMorphoSkel3D as a new technique based on morphology to facilitate an efficient\nskeletonization of shapes. With its low computational cost, MorphoSkel3D is a\nunique, rule-based algorithm to benchmark its quality and performance on two\nlarge datasets, ModelNet and ShapeNet, under different sampling ratios. 
The\nresults show that training with MorphoSkel3D leads to an informed and more\naccurate sampling in the practical application of object classification and\npoint cloud retrieval.\n","authors":["Pierre Onghena","Santiago Velasco-Forero","Beatriz Marcotegui"],"pdf_url":"https://arxiv.org/pdf/2501.12974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19534v2","updated":"2025-01-22T15:37:39Z","published":"2024-03-28T16:07:55Z","title":"Locate, Assign, Refine: Taming Customized Promptable Image Inpainting","summary":" Prior studies have made significant progress in image inpainting guided by\neither text description or subject image. However, the research on inpainting\nwith flexible guidance or control, i.e., text-only, image-only, and their\ncombination, is still in the early stage. Therefore, in this paper, we\nintroduce the multimodal promptable image inpainting project: a new task model,\nand data for taming customized image inpainting. We propose LAR-Gen, a novel\napproach for image inpainting that enables seamless inpainting of specific\nregion in images corresponding to the mask prompt, incorporating both the text\nprompt and image prompt. Our LAR-Gen adopts a coarse-to-fine manner to ensure\nthe context consistency of source image, subject identity consistency, local\nsemantic consistency to the text description, and smoothness consistency. It\nconsists of three mechanisms: (i) Locate mechanism: concatenating the noise\nwith masked scene image to achieve precise regional editing, (ii) Assign\nmechanism: employing decoupled cross-attention mechanism to accommodate\nmulti-modal guidance, and (iii) Refine mechanism: using a novel RefineNet to\nsupplement subject details. Additionally, to address the issue of scarce\ntraining data, we introduce a novel data engine to automatically extract\nsubstantial pairs of data consisting of local text prompts and corresponding\nvisual instances from a vast image data, leveraging publicly available\npre-trained large models. Extensive experiments and various application\nscenarios demonstrate the superiority of LAR-Gen in terms of both identity\npreservation and text semantic consistency.\n","authors":["Yulin Pan","Chaojie Mao","Zeyinzi Jiang","Zhen Han","Jingfeng Zhang","Xiangteng He"],"pdf_url":"https://arxiv.org/pdf/2403.19534v2.pdf","comment":"11 pages, 17 figures"},{"id":"http://arxiv.org/abs/2501.08819v2","updated":"2025-01-22T15:34:51Z","published":"2025-01-15T14:17:13Z","title":"Boosting Diffusion Guidance via Learning Degradation-Aware Models for\n Blind Super Resolution","summary":" Recently, diffusion-based blind super-resolution (SR) methods have shown\ngreat ability to generate high-resolution images with abundant high-frequency\ndetail, but the detail is often achieved at the expense of fidelity. Meanwhile,\nanother line of research focusing on rectifying the reverse process of\ndiffusion models (i.e., diffusion guidance), has demonstrated the power to\ngenerate high-fidelity results for non-blind SR. However, these methods rely on\nknown degradation kernels, making them difficult to apply to blind SR. To\naddress these issues, we present DADiff in this paper. DADiff incorporates\ndegradation-aware models into the diffusion guidance framework, eliminating the\nneed to know degradation kernels. Additionally, we propose two novel techniques\n-- input perturbation and guidance scalar -- to further improve our\nperformance. 
Extensive experimental results show that our proposed method has\nsuperior performance over state-of-the-art methods on blind SR benchmarks.\n","authors":["Shao-Hao Lu","Ren Wang","Ching-Chun Huang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2501.08819v2.pdf","comment":"To appear in WACV 2025. Code is available at:\n https://github.com/ryanlu2240/DADiff"},{"id":"http://arxiv.org/abs/2410.18977v2","updated":"2025-01-22T15:32:45Z","published":"2024-10-24T17:59:45Z","title":"Pay Attention and Move Better: Harnessing Attention for Interactive\n Motion Generation and Training-free Editing","summary":" This research delves into the problem of interactive editing of human motion\ngeneration. Previous motion diffusion models lack explicit modeling of the\nword-level text-motion correspondence and good explainability, hence\nrestricting their fine-grained editing ability. To address this issue, we\npropose an attention-based motion diffusion model, namely MotionCLR, with CLeaR\nmodeling of attention mechanisms. Technically, MotionCLR models the in-modality\nand cross-modality interactions with self-attention and cross-attention,\nrespectively. More specifically, the self-attention mechanism aims to measure\nthe sequential similarity between frames and impacts the order of motion\nfeatures. By contrast, the cross-attention mechanism works to find the\nfine-grained word-sequence correspondence and activate the corresponding\ntimesteps in the motion sequence. Based on these key properties, we develop a\nversatile set of simple yet effective motion editing methods via manipulating\nattention maps, such as motion (de-)emphasizing, in-place motion replacement,\nand example-based motion generation, etc. For further verification of the\nexplainability of the attention mechanism, we additionally explore the\npotential of action-counting and grounded motion generation ability via\nattention maps. Our experimental results show that our method enjoys good\ngeneration and editing ability with good explainability.\n","authors":["Ling-Hao Chen","Shunlin Lu","Wenxun Dai","Zhiyang Dou","Xuan Ju","Jingbo Wang","Taku Komura","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.18977v2.pdf","comment":"Updated MotionCLR technical report"},{"id":"http://arxiv.org/abs/2501.12958v1","updated":"2025-01-22T15:32:07Z","published":"2025-01-22T15:32:07Z","title":"A Novel Tracking Framework for Devices in X-ray Leveraging Supplementary\n Cue-Driven Self-Supervised Features","summary":" To restore proper blood flow in blocked coronary arteries via angioplasty\nprocedure, accurate placement of devices such as catheters, balloons, and\nstents under live fluoroscopy or diagnostic angiography is crucial. Identified\nballoon markers help in enhancing stent visibility in X-ray sequences, while\nthe catheter tip aids in precise navigation and co-registering vessel\nstructures, reducing the need for contrast in angiography. However, accurate\ndetection of these devices in interventional X-ray sequences faces significant\nchallenges, particularly due to occlusions from contrasted vessels and other\ndevices and distractions from surrounding, resulting in the failure to track\nsuch small objects. While most tracking methods rely on spatial correlation of\npast and current appearance, they often lack strong motion comprehension\nessential for navigating through these challenging conditions, and fail to\neffectively detect multiple instances in the scene. 
To overcome these\nlimitations, we propose a self-supervised learning approach that enhances its\nspatio-temporal understanding by incorporating supplementary cues and learning\nacross multiple representation spaces on a large dataset. Followed by that, we\nintroduce a generic real-time tracking framework that effectively leverages the\npretrained spatio-temporal network and also takes the historical appearance and\ntrajectory data into account. This results in enhanced localization of multiple\ninstances of device landmarks. Our method outperforms state-of-the-art methods\nin interventional X-ray device tracking, especially stability and robustness,\nachieving an 87% reduction in max error for balloon marker detection and a 61%\nreduction in max error for catheter tip detection.\n","authors":["Saahil Islam","Venkatesh N. Murthy","Dominik Neumann","Serkan Cimen","Puneet Sharma","Andreas Maier","Dorin Comaniciu","Florin C. Ghesu"],"pdf_url":"https://arxiv.org/pdf/2501.12958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18431v2","updated":"2025-01-22T15:09:00Z","published":"2024-09-27T03:44:07Z","title":"Search3D: Hierarchical Open-Vocabulary 3D Segmentation","summary":" Open-vocabulary 3D segmentation enables exploration of 3D spaces using\nfree-form text descriptions. Existing methods for open-vocabulary 3D instance\nsegmentation primarily focus on identifying object-level instances but struggle\nwith finer-grained scene entities such as object parts, or regions described by\ngeneric attributes. In this work, we introduce Search3D, an approach to\nconstruct hierarchical open-vocabulary 3D scene representations, enabling 3D\nsearch at multiple levels of granularity: fine-grained object parts, entire\nobjects, or regions described by attributes like materials. Unlike prior\nmethods, Search3D shifts towards a more flexible open-vocabulary 3D search\nparadigm, moving beyond explicit object-centric queries. For systematic\nevaluation, we further contribute a scene-scale open-vocabulary 3D part\nsegmentation benchmark based on MultiScan, along with a set of open-vocabulary\nfine-grained part annotations on ScanNet++. Search3D outperforms baselines in\nscene-scale open-vocabulary 3D part segmentation, while maintaining strong\nperformance in segmenting 3D objects and materials. Our project page is\nhttp://search3d-segmentation.github.io.\n","authors":["Ayca Takmaz","Alexandros Delitzas","Robert W. Sumner","Francis Engelmann","Johanna Wald","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2409.18431v2.pdf","comment":"This manuscript is provided as a pre-print, it has been accepted for\n publication by IEEE RA-L"},{"id":"http://arxiv.org/abs/2501.12935v1","updated":"2025-01-22T15:06:30Z","published":"2025-01-22T15:06:30Z","title":"3D Object Manipulation in a Single Image using Generative Models","summary":" Object manipulation in images aims to not only edit the object's presentation\nbut also gift objects with motion. Previous methods encountered challenges in\nconcurrently handling static editing and dynamic generation, while also\nstruggling to achieve fidelity in object appearance and scene lighting. In this\nwork, we introduce \\textbf{OMG3D}, a novel framework that integrates the\nprecise geometric control with the generative power of diffusion models, thus\nachieving significant enhancements in visual performance. Our framework first\nconverts 2D objects into 3D, enabling user-directed modifications and lifelike\nmotions at the geometric level. 
To address texture realism, we propose\nCustomRefiner, a texture refinement module that pre-train a customized\ndiffusion model, aligning the details and style of coarse renderings of 3D\nrough model with the original image, further refine the texture. Additionally,\nwe introduce IllumiCombiner, a lighting processing module that estimates and\ncorrects background lighting to match human visual perception, resulting in\nmore realistic shadow effects. Extensive experiments demonstrate the\noutstanding visual performance of our approach in both static and dynamic\nscenarios. Remarkably, all these steps can be done using one NVIDIA 3090.\nProject page is at https://whalesong-zrs.github.io/OMG3D-projectpage/\n","authors":["Ruisi Zhao","Zechuan Zhang","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2501.12935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12931v1","updated":"2025-01-22T15:02:43Z","published":"2025-01-22T15:02:43Z","title":"DynamicEarth: How Far are We from Open-Vocabulary Change Detection?","summary":" Monitoring Earth's evolving land covers requires methods capable of detecting\nchanges across a wide range of categories and contexts. Existing change\ndetection methods are hindered by their dependency on predefined classes,\nreducing their effectiveness in open-world applications. To address this issue,\nwe introduce open-vocabulary change detection (OVCD), a novel task that bridges\nvision and language to detect changes across any category. Considering the lack\nof high-quality data and annotation, we propose two training-free frameworks,\nM-C-I and I-M-C, which leverage and integrate off-the-shelf foundation models\nfor the OVCD task. The insight behind the M-C-I framework is to discover all\npotential changes and then classify these changes, while the insight of I-M-C\nframework is to identify all targets of interest and then determine whether\ntheir states have changed. Based on these two frameworks, we instantiate to\nobtain several methods, e.g., SAM-DINOv2-SegEarth-OV, Grounding-DINO-SAM2-DINO,\netc. Extensive evaluations on 5 benchmark datasets demonstrate the superior\ngeneralization and robustness of our OVCD methods over existing supervised and\nunsupervised methods. To support continued exploration, we release\nDynamicEarth, a dedicated codebase designed to advance research and application\nof OVCD. https://likyoo.github.io/DynamicEarth\n","authors":["Kaiyu Li","Xiangyong Cao","Yupeng Deng","Chao Pang","Zepeng Xin","Deyu Meng","Zhi Wang"],"pdf_url":"https://arxiv.org/pdf/2501.12931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04940v3","updated":"2025-01-22T14:45:33Z","published":"2024-08-09T08:46:22Z","title":"Capsule Vision 2024 Challenge: Multi-Class Abnormality Classification\n for Video Capsule Endoscopy","summary":" We present the Capsule Vision 2024 Challenge: Multi-Class Abnormality\nClassification for Video Capsule Endoscopy. 
It was virtually organized by the\nResearch Center for Medical Image Analysis and Artificial Intelligence (MIAAI),\nDepartment of Medicine, Danube Private University, Krems, Austria in\ncollaboration with the 9th International Conference on Computer Vision & Image\nProcessing (CVIP 2024) being organized by the Indian Institute of Information\nTechnology, Design and Manufacturing (IIITDM) Kancheepuram, Chennai, India.\nThis document provides an overview of the challenge, including the registration\nprocess, rules, submission format, description of the datasets used, qualified\nteam rankings, all team descriptions, and the benchmarking results reported by\nthe organizers.\n","authors":["Palak Handa","Amirreza Mahbod","Florian Schwarzhans","Ramona Woitek","Nidhi Goel","Manas Dhir","Deepti Chhabra","Shreshtha Jha","Pallavi Sharma","Vijay Thakur","Simarpreet Singh Chawla","Deepak Gunjan","Jagadeesh Kakarla","Balasubramanian Raman"],"pdf_url":"https://arxiv.org/pdf/2408.04940v3.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2501.12910v1","updated":"2025-01-22T14:37:01Z","published":"2025-01-22T14:37:01Z","title":"PreciseCam: Precise Camera Control for Text-to-Image Generation","summary":" Images as an artistic medium often rely on specific camera angles and lens\ndistortions to convey ideas or emotions; however, such precise control is\nmissing in current text-to-image models. We propose an efficient and general\nsolution that allows precise control over the camera when generating both\nphotographic and artistic images. Unlike prior methods that rely on predefined\nshots, we rely solely on four simple extrinsic and intrinsic camera parameters,\nremoving the need for pre-existing geometry, reference 3D objects, and\nmulti-view data. We also present a novel dataset with more than 57,000 images,\nalong with their text prompts and ground-truth camera parameters. Our\nevaluation shows precise camera control in text-to-image generation, surpassing\ntraditional prompt engineering approaches. Our data, model, and code are\npublicly available at https://graphics.unizar.es/projects/PreciseCam2024.\n","authors":["Edurne Bernal-Berdun","Ana Serrano","Belen Masia","Matheus Gadelha","Yannick Hold-Geoffroy","Xin Sun","Diego Gutierrez"],"pdf_url":"https://arxiv.org/pdf/2501.12910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12898v1","updated":"2025-01-22T14:18:47Z","published":"2025-01-22T14:18:47Z","title":"DocTTT: Test-Time Training for Handwritten Document Recognition Using\n Meta-Auxiliary Learning","summary":" Despite recent significant advancements in Handwritten Document Recognition\n(HDR), the efficient and accurate recognition of text against complex\nbackgrounds, diverse handwriting styles, and varying document layouts remains a\npractical challenge. Moreover, this issue is seldom addressed in academic\nresearch, particularly in scenarios with minimal annotated data available. In\nthis paper, we introduce the DocTTT framework to address these challenges. The\nkey innovation of our approach is that it uses test-time training to adapt the\nmodel to each specific input during testing. We propose a novel Meta-Auxiliary\nlearning approach that combines Meta-learning and self-supervised Masked\nAutoencoder~(MAE). During testing, we adapt the visual representation\nparameters using a self-supervised MAE loss. During training, we learn the\nmodel parameters using a meta-learning framework, so that the model parameters\nare learned to adapt to a new input effectively. 
Experimental results show that\nour proposed method significantly outperforms existing state-of-the-art\napproaches on benchmark datasets.\n","authors":["Wenhao Gu","Li Gu","Ziqiang Wang","Ching Yee Suen","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2501.12898v1.pdf","comment":"WACV2025, camera ready with updated reference"},{"id":"http://arxiv.org/abs/2306.07346v2","updated":"2025-01-22T13:28:01Z","published":"2023-06-12T18:12:19Z","title":"Learning to Mask and Permute Visual Tokens for Vision Transformer\n Pre-Training","summary":" The use of self-supervised pre-training has emerged as a promising approach\nto enhance the performance of many different visual tasks. In this context,\nrecent approaches have employed the Masked Image Modeling paradigm, which\npre-trains a backbone by reconstructing visual tokens associated with randomly\nmasked image patches. This masking approach, however, introduces noise into the\ninput data during pre-training, leading to discrepancies that can impair\nperformance during the fine-tuning phase. Furthermore, input masking neglects\nthe dependencies between corrupted patches, increasing the inconsistencies\nobserved in downstream fine-tuning tasks. To overcome these issues, we propose\na new self-supervised pre-training approach, named Masked and Permuted Vision\nTransformer (MaPeT), that employs autoregressive and permuted predictions to\ncapture intra-patch dependencies. In addition, MaPeT employs auxiliary\npositional information to reduce the disparity between the pre-training and\nfine-tuning phases. In our experiments, we employ a fair setting to ensure\nreliable and meaningful comparisons and conduct investigations on multiple\nvisual tokenizers, including our proposed $k$-CLIP which directly employs\ndiscretized CLIP features. Our results demonstrate that MaPeT achieves\ncompetitive performance on ImageNet, compared to baselines and competitors\nunder the same model setting. We release an implementation of our code and\nmodels at https://github.com/aimagelab/MaPeT.\n","authors":["Lorenzo Baraldi","Roberto Amoroso","Marcella Cornia","Lorenzo Baraldi","Andrea Pilzer","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2306.07346v2.pdf","comment":"Computer Vision and Image Understanding (2025)"},{"id":"http://arxiv.org/abs/2501.12860v1","updated":"2025-01-22T13:13:41Z","published":"2025-01-22T13:13:41Z","title":"CrossDiff: Diffusion Probabilistic Model With Cross-conditional\n Encoder-Decoder for Crack Segmentation","summary":" Crack Segmentation in industrial concrete surfaces is a challenging task\nbecause cracks usually exhibit intricate morphology with slender appearances.\nTraditional segmentation methods often struggle to accurately locate such\ncracks, leading to inefficiencies in maintenance and repair processes. In this\npaper, we propose a novel diffusion-based model with a cross-conditional\nencoder-decoder, named CrossDiff, which is the first to introduce the diffusion\nprobabilistic model for the crack segmentation task. Specifically, CrossDiff\nintegrates a cross-encoder and a cross-decoder into the diffusion model to\nconstitute a cross-shaped diffusion model structure. The cross-encoder enhances\nthe ability to retain crack details and the cross-decoder helps extract the\nsemantic features of cracks. As a result, CrossDiff can better handle slender\ncracks. Extensive experiments were conducted on five challenging crack datasets\nincluding CFD, CrackTree200, DeepCrack, GAPs384, and Rissbilder. 
The results\ndemonstrate that the proposed CrossDiff model achieves impressive performance,\noutperforming other state-of-the-art methods by 8.0% in terms of both Dice\nscore and IoU. The code will be open-source soon.\n","authors":["Xianglong Shi","Yunhan Jiang","Xiaoheng Jiang","Mingling Xu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.12860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05435v7","updated":"2025-01-22T12:51:33Z","published":"2024-03-08T16:38:11Z","title":"OmniCount: Multi-label Object Counting with Semantic-Geometric Priors","summary":" Object counting is pivotal for understanding the composition of scenes.\nPreviously, this task was dominated by class-specific methods, which have\ngradually evolved into more adaptable class-agnostic strategies. However, these\nstrategies come with their own set of limitations, such as the need for manual\nexemplar input and multiple passes for multiple categories, resulting in\nsignificant inefficiencies. This paper introduces a more practical approach\nenabling simultaneous counting of multiple object categories using an\nopen-vocabulary framework. Our solution, OmniCount, stands out by using\nsemantic and geometric insights (priors) from pre-trained models to count\nmultiple categories of objects as specified by users, all without additional\ntraining. OmniCount distinguishes itself by generating precise object masks and\nleveraging varied interactive prompts via the Segment Anything Model for\nefficient counting. To evaluate OmniCount, we created the OmniCount-191\nbenchmark, a first-of-its-kind dataset with multi-label object counts,\nincluding points, bounding boxes, and VQA annotations. Our comprehensive\nevaluation in OmniCount-191, alongside other leading benchmarks, demonstrates\nOmniCount's exceptional performance, significantly outpacing existing\nsolutions. The project webpage is available at\nhttps://mondalanindya.github.io/OmniCount.\n","authors":["Anindya Mondal","Sauradip Nag","Xiatian Zhu","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.05435v7.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2501.12844v1","updated":"2025-01-22T12:45:09Z","published":"2025-01-22T12:45:09Z","title":"GAMED-Snake: Gradient-aware Adaptive Momentum Evolution Deep Snake Model\n for Multi-organ Segmentation","summary":" Multi-organ segmentation is a critical yet challenging task due to complex\nanatomical backgrounds, blurred boundaries, and diverse morphologies. This\nstudy introduces the Gradient-aware Adaptive Momentum Evolution Deep Snake\n(GAMED-Snake) model, which establishes a novel paradigm for contour-based\nsegmentation by integrating gradient-based learning with adaptive momentum\nevolution mechanisms. The GAMED-Snake model incorporates three major\ninnovations: First, the Distance Energy Map Prior (DEMP) generates a\npixel-level force field that effectively attracts contour points towards the\ntrue boundaries, even in scenarios with complex backgrounds and blurred edges.\nSecond, the Differential Convolution Inception Module (DCIM) precisely extracts\ncomprehensive energy gradients, significantly enhancing segmentation accuracy.\nThird, the Adaptive Momentum Evolution Mechanism (AMEM) employs cross-attention\nto establish dynamic features across different iterations of evolution,\nenabling precise boundary alignment for diverse morphologies. 
Experimental\nresults on four challenging multi-organ segmentation datasets demonstrate that\nGAMED-Snake improves the mDice metric by approximately 2% compared to\nstate-of-the-art methods. Code will be available at\nhttps://github.com/SYSUzrc/GAMED-Snake.\n","authors":["Ruicheng Zhang","Haowei Guo","Zeyu Zhang","Puxin Yan","Shen Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.12844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10202v3","updated":"2025-01-22T12:35:03Z","published":"2024-08-19T17:57:28Z","title":"SANER: Annotation-free Societal Attribute Neutralizer for Debiasing CLIP","summary":" Large-scale vision-language models, such as CLIP, are known to contain\nsocietal bias regarding protected attributes (e.g., gender, age). This paper\naims to address the problems of societal bias in CLIP. Although previous\nstudies have proposed to debias societal bias through adversarial learning or\ntest-time projecting, our comprehensive study of these works identifies two\ncritical limitations: 1) loss of attribute information when it is explicitly\ndisclosed in the input and 2) use of the attribute annotations during debiasing\nprocess. To mitigate societal bias in CLIP and overcome these limitations\nsimultaneously, we introduce a simple-yet-effective debiasing method called\nSANER (societal attribute neutralizer) that eliminates attribute information\nfrom CLIP text features only of attribute-neutral descriptions. Experimental\nresults show that SANER, which does not require attribute annotations and\npreserves original information for attribute-specific descriptions,\ndemonstrates superior debiasing ability than the existing methods.\nAdditionally, we observe that SANER does not require retraining CLIP from\nscratch with the original dataset. Moreover, the debiased model can be directly\napplied to the text-to-image generation model by simply replacing the text\nencoder.\n","authors":["Yusuke Hirota","Min-Hung Chen","Chien-Yi Wang","Yuta Nakashima","Yu-Chiang Frank Wang","Ryo Hachiuma"],"pdf_url":"https://arxiv.org/pdf/2408.10202v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12840v1","updated":"2025-01-22T12:29:33Z","published":"2025-01-22T12:29:33Z","title":"AMM-Diff: Adaptive Multi-Modality Diffusion Network for Missing Modality\n Imputation","summary":" In clinical practice, full imaging is not always feasible, often due to\ncomplex acquisition protocols, stringent privacy regulations, or specific\nclinical needs. However, missing MR modalities pose significant challenges for\ntasks like brain tumor segmentation, especially in deep learning-based\nsegmentation, as each modality provides complementary information crucial for\nimproving accuracy. A promising solution is missing data imputation, where\nabsent modalities are generated from available ones. While generative models\nhave been widely used for this purpose, most state-of-the-art approaches are\nlimited to single or dual target translations, lacking the adaptability to\ngenerate missing modalities based on varying input configurations. To address\nthis, we propose an Adaptive Multi-Modality Diffusion Network (AMM-Diff), a\nnovel diffusion-based generative model capable of handling any number of input\nmodalities and generating the missing ones. We designed an Image-Frequency\nFusion Network (IFFN) that learns a unified feature representation through a\nself-supervised pretext task across the full input modalities and their\nselected high-frequency Fourier components. 
The proposed diffusion model\nleverages this representation, encapsulating prior knowledge of the complete\nmodalities, and combines it with an adaptive reconstruction strategy to achieve\nmissing modality completion. Experimental results on the BraTS 2021 dataset\ndemonstrate the effectiveness of our approach.\n","authors":["Aghiles Kebaili","Jérôme Lapuyade-Lahorgue","Pierre Vera","Su Ruan"],"pdf_url":"https://arxiv.org/pdf/2501.12840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12832v1","updated":"2025-01-22T12:19:47Z","published":"2025-01-22T12:19:47Z","title":"FDG-Diff: Frequency-Domain-Guided Diffusion Framework for Compressed\n Hazy Image Restoration","summary":" In this study, we reveal that the interaction between haze degradation and\nJPEG compression introduces complex joint loss effects, which significantly\ncomplicate image restoration. Existing dehazing models often neglect\ncompression effects, which limits their effectiveness in practical\napplications. To address these challenges, we introduce three key\ncontributions. First, we design FDG-Diff, a novel frequency-domain-guided\ndehazing framework that improves JPEG image restoration by leveraging\nfrequency-domain information. Second, we introduce the High-Frequency\nCompensation Module (HFCM), which enhances spatial-domain detail restoration by\nincorporating frequency-domain augmentation techniques into a diffusion-based\nrestoration framework. Lastly, the introduction of the Degradation-Aware\nDenoising Timestep Predictor (DADTP) module further enhances restoration\nquality by enabling adaptive region-specific restoration, effectively\naddressing regional degradation inconsistencies in compressed hazy images.\nExperimental results across multiple compressed dehazing datasets demonstrate\nthat our method consistently outperforms the latest state-of-the-art\napproaches. Code be available at https://github.com/SYSUzrc/FDG-Diff.\n","authors":["Ruicheng Zhang","Kanghui Tian","Zeyu Zhang","Qixiang Liu","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2501.12832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12386v2","updated":"2025-01-22T12:08:20Z","published":"2025-01-21T18:59:00Z","title":"InternVideo2.5: Empowering Video MLLMs with Long and Rich Context\n Modeling","summary":" This paper aims to improve the performance of video multimodal large language\nmodels (MLLM) via long and rich context (LRC) modeling. As a result, we develop\na new version of InternVideo2.5 with a focus on enhancing the original MLLMs'\nability to perceive fine-grained details and capture long-form temporal\nstructure in videos. Specifically, our approach incorporates dense vision task\nannotations into MLLMs using direct preference optimization and develops\ncompact spatiotemporal representations through adaptive hierarchical token\ncompression. Experimental results demonstrate this unique design of LRC greatly\nimproves the results of video MLLM in mainstream video understanding benchmarks\n(short & long), enabling the MLLM to memorize significantly longer video inputs\n(at least 6x longer than the original), and master specialized vision\ncapabilities like object tracking and segmentation. Our work highlights the\nimportance of multimodal context richness (length and fineness) in empowering\nMLLM's innate abilites (focus and memory), providing new insights for future\nresearch on video MLLM. 
Code and models are available at\nhttps://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2.5\n","authors":["Yi Wang","Xinhao Li","Ziang Yan","Yinan He","Jiashuo Yu","Xiangyu Zeng","Chenting Wang","Changlian Ma","Haian Huang","Jianfei Gao","Min Dou","Kai Chen","Wenhai Wang","Yu Qiao","Yali Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.12386v2.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2501.12824v1","updated":"2025-01-22T12:04:58Z","published":"2025-01-22T12:04:58Z","title":"Enhancing Monocular Depth Estimation with Multi-Source Auxiliary Tasks","summary":" Monocular depth estimation (MDE) is a challenging task in computer vision,\noften hindered by the cost and scarcity of high-quality labeled datasets. We\ntackle this challenge using auxiliary datasets from related vision tasks for an\nalternating training scheme with a shared decoder built on top of a pre-trained\nvision foundation model, while giving a higher weight to MDE. Through extensive\nexperiments we demonstrate the benefits of incorporating various in-domain\nauxiliary datasets and tasks to improve MDE quality on average by ~11%. Our\nexperimental analysis shows that auxiliary tasks have different impacts,\nconfirming the importance of task selection, highlighting that quality gains\nare not achieved by merely adding data. Remarkably, our study reveals that\nusing semantic segmentation datasets as Multi-Label Dense Classification (MLDC)\noften results in additional quality gains. Lastly, our method significantly\nimproves the data efficiency for the considered MDE datasets, enhancing their\nquality while reducing their size by at least 80%. This paves the way for using\nauxiliary data from related tasks to improve MDE quality despite limited\navailability of high-quality labeled data. Code is available at\nhttps://jugit.fz-juelich.de/ias-8/mdeaux.\n","authors":["Alessio Quercia","Erenus Yildiz","Zhuo Cao","Kai Krajsek","Abigail Morrison","Ira Assent","Hanno Scharr"],"pdf_url":"https://arxiv.org/pdf/2501.12824v1.pdf","comment":"Paper accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2501.12202v2","updated":"2025-01-22T12:01:39Z","published":"2025-01-21T15:16:54Z","title":"Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D\n Assets Generation","summary":" We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for\ngenerating high-resolution textured 3D assets. This system includes two\nfoundation components: a large-scale shape generation model -- Hunyuan3D-DiT,\nand a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape\ngenerative model, built on a scalable flow-based diffusion transformer, aims to\ncreate geometry that properly aligns with a given condition image, laying a\nsolid foundation for downstream applications. The texture synthesis model,\nbenefiting from strong geometric and diffusion priors, produces high-resolution\nand vibrant texture maps for either generated or hand-crafted meshes.\nFurthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production\nplatform that simplifies the re-creation process of 3D assets. It allows both\nprofessional and amateur users to manipulate or even animate their meshes\nefficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0\noutperforms previous state-of-the-art models, including the open-source models\nand closed-source models in geometry details, condition alignment, texture\nquality, and etc. 
Hunyuan3D 2.0 is publicly released in order to fill the gaps\nin the open-source 3D community for large-scale foundation generative models.\nThe code and pre-trained weights of our models are available at:\nhttps://github.com/Tencent/Hunyuan3D-2\n","authors":["Zibo Zhao","Zeqiang Lai","Qingxiang Lin","Yunfei Zhao","Haolin Liu","Shuhui Yang","Yifei Feng","Mingxin Yang","Sheng Zhang","Xianghui Yang","Huiwen Shi","Sicong Liu","Junta Wu","Yihang Lian","Fan Yang","Ruining Tang","Zebin He","Xinzhou Wang","Jian Liu","Xuhui Zuo","Zhuo Chen","Biwen Lei","Haohan Weng","Jing Xu","Yiling Zhu","Xinhai Liu","Lixin Xu","Changrong Hu","Tianyu Huang","Lifu Wang","Jihong Zhang","Meng Chen","Liang Dong","Yiwen Jia","Yulin Cai","Jiaao Yu","Yixuan Tang","Hao Zhang","Zheng Ye","Peng He","Runzhou Wu","Chao Zhang","Yonghao Tan","Jie Xiao","Yangyu Tao","Jianchen Zhu","Jinbao Xue","Kai Liu","Chongqing Zhao","Xinming Wu","Zhichao Hu","Lei Qin","Jianbing Peng","Zhan Li","Minghui Chen","Xipeng Zhang","Lin Niu","Paige Wang","Yingkai Wang","Haozhao Kuang","Zhongyi Fan","Xu Zheng","Weihao Zhuang","YingPing He","Tian Liu","Yong Yang","Di Wang","Yuhong Liu","Jie Jiang","Jingwei Huang","Chunchao Guo"],"pdf_url":"https://arxiv.org/pdf/2501.12202v2.pdf","comment":"GitHub link: https://github.com/Tencent/Hunyuan3D-2"},{"id":"http://arxiv.org/abs/2501.12810v1","updated":"2025-01-22T11:41:41Z","published":"2025-01-22T11:41:41Z","title":"Machine Learning Modeling for Multi-order Human Visual Motion Processing","summary":" Our research aims to develop machines that learn to perceive visual motion as\ndo humans. While recent advances in computer vision (CV) have enabled DNN-based\nmodels to accurately estimate optical flow in naturalistic images, a\nsignificant disparity remains between CV models and the biological visual\nsystem in both architecture and behavior. This disparity includes humans'\nability to perceive the motion of higher-order image features (second-order\nmotion), which many CV models fail to capture because of their reliance on the\nintensity conservation law. Our model architecture mimics the cortical V1-MT\nmotion processing pathway, utilizing a trainable motion energy sensor bank and\na recurrent graph network. Supervised learning employing diverse naturalistic\nvideos allows the model to replicate psychophysical and physiological findings\nabout first-order (luminance-based) motion perception. For second-order motion,\ninspired by neuroscientific findings, the model includes an additional sensing\npathway with nonlinear preprocessing before motion energy sensing, implemented\nusing a simple multilayer 3D CNN block. When exploring how the brain acquired\nthe ability to perceive second-order motion in natural environments, in which\npure second-order signals are rare, we hypothesized that second-order\nmechanisms were critical when estimating robust object motion amidst optical\nfluctuations, such as highlights on glossy surfaces. We trained our\ndual-pathway model on novel motion datasets with varying material properties of\nmoving objects. We found that training to estimate object motion from\nnon-Lambertian materials naturally endowed the model with the capacity to\nperceive second-order motion, as can humans. 
The resulting model effectively\naligns with biological systems while generalizing to both first- and\nsecond-order motion phenomena in natural scenes.\n","authors":["Zitang Sun","Yen-Ju Chen","Yung-Hao Yang","Yuan Li","Shin'ya Nishida"],"pdf_url":"https://arxiv.org/pdf/2501.12810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12375v2","updated":"2025-01-22T11:33:54Z","published":"2025-01-21T18:53:30Z","title":"Video Depth Anything: Consistent Depth Estimation for Super-Long Videos","summary":" Depth Anything has achieved remarkable success in monocular depth estimation\nwith strong generalization ability. However, it suffers from temporal\ninconsistency in videos, hindering its practical applications. Various methods\nhave been proposed to alleviate this issue by leveraging video generation\nmodels or introducing priors from optical flow and camera poses. Nonetheless,\nthese methods are only applicable to short videos (< 10 seconds) and require a\ntrade-off between quality and computational efficiency. We propose Video Depth\nAnything for high-quality, consistent depth estimation in super-long videos\n(over several minutes) without sacrificing efficiency. We base our model on\nDepth Anything V2 and replace its head with an efficient spatial-temporal head.\nWe design a straightforward yet effective temporal consistency loss by\nconstraining the temporal depth gradient, eliminating the need for additional\ngeometric priors. The model is trained on a joint dataset of video depth and\nunlabeled images, similar to Depth Anything V2. Moreover, a novel\nkey-frame-based strategy is developed for long video inference. Experiments\nshow that our model can be applied to arbitrarily long videos without\ncompromising quality, consistency, or generalization ability. Comprehensive\nevaluations on multiple video benchmarks demonstrate that our approach sets a\nnew state-of-the-art in zero-shot video depth estimation. We offer models of\ndifferent scales to support a range of scenarios, with our smallest model\ncapable of real-time performance at 30 FPS.\n","authors":["Sili Chen","Hengkai Guo","Shengnan Zhu","Feihu Zhang","Zilong Huang","Jiashi Feng","Bingyi Kang"],"pdf_url":"https://arxiv.org/pdf/2501.12375v2.pdf","comment":"Project page: https://videodepthanything.github.io/"},{"id":"http://arxiv.org/abs/2312.08704v2","updated":"2025-01-22T11:30:49Z","published":"2023-12-14T07:43:53Z","title":"PairingNet: A Learning-based Pair-searching and -matching Network for\n Image Fragments","summary":" In this paper, we propose a learning-based image fragment pair-searching and\n-matching approach to solve the challenging restoration problem. Existing works\nuse rule-based methods to match similar contour shapes or textures, which are\nalways difficult to tune hyperparameters for extensive data and computationally\ntime-consuming. Therefore, we propose a neural network that can effectively\nutilize neighbor textures with contour shape information to fundamentally\nimprove performance. First, we employ a graph-based network to extract the\nlocal contour and texture features of fragments. Then, for the pair-searching\ntask, we adopt a linear transformer-based module to integrate these local\nfeatures and use contrastive loss to encode the global features of each\nfragment. 
For the pair-matching task, we design a weighted fusion module to\ndynamically fuse extracted local contour and texture features, and formulate a\nsimilarity matrix for each pair of fragments to calculate the matching score\nand infer the adjacent segment of contours. To faithfully evaluate our proposed\nnetwork, we created a new image fragment dataset through an algorithm we\ndesigned that tears complete images into irregular fragments. The experimental\nresults show that our proposed network achieves excellent pair-searching\naccuracy, reduces matching errors, and significantly reduces computational\ntime. Details, sourcecode, and data are available in our supplementary\nmaterial.\n","authors":["Rixin Zhou","Ding Xia","Yi Zhang","Honglin Pang","Xi Yang","Chuntao Li"],"pdf_url":"https://arxiv.org/pdf/2312.08704v2.pdf","comment":"25 pages, 19 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.14358v2","updated":"2025-01-22T11:07:57Z","published":"2024-08-26T15:32:31Z","title":"An Embedding is Worth a Thousand Noisy Labels","summary":" The performance of deep neural networks scales with dataset size and label\nquality, rendering the efficient mitigation of low-quality data annotations\ncrucial for building robust and cost-effective systems. Existing strategies to\naddress label noise exhibit severe limitations due to computational complexity\nand application dependency. In this work, we propose WANN, a Weighted Adaptive\nNearest Neighbor approach that builds on self-supervised feature\nrepresentations obtained from foundation models. To guide the weighted voting\nscheme, we introduce a reliability score, which measures the likelihood of a\ndata label being correct. WANN outperforms reference methods, including a\nlinear layer trained with robust loss functions, on diverse datasets of varying\nsize and under various noise types and severities. WANN also exhibits superior\ngeneralization on imbalanced data compared to both Adaptive-NNs (ANN) and fixed\nk-NNs. Furthermore, the proposed weighting scheme enhances supervised\ndimensionality reduction under noisy labels. This yields a significant boost in\nclassification performance with 10x and 100x smaller image embeddings,\nminimizing latency and storage requirements. Our approach, emphasizing\nefficiency and explainability, emerges as a simple, robust solution to overcome\ninherent limitations of deep neural network training. The code is available at\nhttps://github.com/francescodisalvo05/wann-noisy-labels .\n","authors":["Francesco Di Salvo","Sebastian Doerrich","Ines Rieger","Christian Ledig"],"pdf_url":"https://arxiv.org/pdf/2408.14358v2.pdf","comment":"Preprint - Under Review"},{"id":"http://arxiv.org/abs/2309.05406v5","updated":"2025-01-22T10:56:19Z","published":"2023-09-11T12:12:52Z","title":"Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI\n Generation and Diffuse Glioma Growth Prediction","summary":" Diffuse gliomas are malignant brain tumors that grow widespread through the\nbrain. The complex interactions between neoplastic cells and normal tissue, as\nwell as the treatment-induced changes often encountered, make glioma tumor\ngrowth modeling challenging. In this paper, we present a novel end-to-end\nnetwork capable of future predictions of tumor masks and multi-parametric\nmagnetic resonance images (MRI) of how the tumor will look at any future time\npoints for different treatment plans. Our approach is based on cutting-edge\ndiffusion probabilistic models and deep-segmentation neural networks. 
We\nincluded sequential multi-parametric MRI and treatment information as\nconditioning inputs to guide the generative diffusion process as well as a\njoint segmentation process. This allows for tumor growth estimates and\nrealistic MRI generation at any given treatment and time point. We trained the\nmodel using real-world postoperative longitudinal MRI data with glioma tumor\ngrowth trajectories represented as tumor segmentation maps over time. The model\ndemonstrates promising performance across various tasks, including generating\nhigh-quality multi-parametric MRI with tumor masks, performing time-series\ntumor segmentations, and providing uncertainty estimates. Combined with the\ntreatment-aware generated MRI, the tumor growth predictions with uncertainty\nestimates can provide useful information for clinical decision-making.\n","authors":["Qinghui Liu","Elies Fuster-Garcia","Ivar Thokle Hovden","Bradley J MacIntosh","Edvard Grødem","Petter Brandal","Carles Lopez-Mateu","Donatas Sederevicius","Karoline Skogen","Till Schellhorn","Atle Bjørnerud","Kyrre Eeg Emblem"],"pdf_url":"https://arxiv.org/pdf/2309.05406v5.pdf","comment":"preprints in IEEE-TMI, 14 pages"},{"id":"http://arxiv.org/abs/2501.10733v2","updated":"2025-01-22T10:50:37Z","published":"2025-01-18T11:39:46Z","title":"A CNN-Transformer for Classification of Longitudinal 3D MRI Images -- A\n Case Study on Hepatocellular Carcinoma Prediction","summary":" Longitudinal MRI analysis is crucial for predicting disease outcomes,\nparticularly in chronic conditions like hepatocellular carcinoma (HCC), where\nearly detection can significantly influence treatment strategies and patient\nprognosis. Yet, due to challenges like limited data availability, subtle\nparenchymal changes, and the irregular timing of medical screenings, current\napproaches have so far focused on cross-sectional imaging data. To address\nthis, we propose HCCNet, a novel model architecture that integrates a 3D\nadaptation of the ConvNeXt CNN architecture with a Transformer encoder,\ncapturing both the intricate spatial features of 3D MRIs and the complex\ntemporal dependencies across different time points. HCCNet utilizes a two-stage\npre-training process tailored for longitudinal MRI data. The CNN backbone is\npre-trained using a self-supervised learning framework adapted for 3D MRIs,\nwhile the Transformer encoder is pre-trained with a sequence-order-prediction\ntask to enhance its understanding of disease progression over time. We\ndemonstrate the effectiveness of HCCNet by applying it to a cohort of liver\ncirrhosis patients undergoing regular MRI screenings for HCC surveillance. Our\nresults show that HCCNet significantly improves predictive accuracy and\nreliability over baseline models, providing a robust tool for personalized HCC\nsurveillance. The methodological approach presented in this paper is versatile\nand can be adapted to various longitudinal MRI screening applications. Its\nability to handle varying patient record lengths and irregular screening\nintervals establishes it as an invaluable framework for monitoring chronic\ndiseases, where timely and accurate disease prognosis is critical for effective\ntreatment planning.\n","authors":["Jakob Nolte","Maureen M. J. Guichelaar","Donald E. Bouman","Stephanie M. 
van den Berg","Maryam Amir Haeri"],"pdf_url":"https://arxiv.org/pdf/2501.10733v2.pdf","comment":"Submitted for publication to Biomedical Signal Processing and\n Control; Incorrect notation corrected"},{"id":"http://arxiv.org/abs/2403.16184v2","updated":"2025-01-22T10:30:32Z","published":"2024-03-24T15:02:24Z","title":"Predicate Debiasing in Vision-Language Models Integration for Scene\n Graph Generation Enhancement","summary":" Scene Graph Generation (SGG) provides basic language representation of visual\nscenes, requiring models to grasp complex and diverse semantics between\nobjects. This complexity and diversity in SGG leads to underrepresentation,\nwhere parts of triplet labels are rare or even unseen during training,\nresulting in imprecise predictions. To tackle this, we propose integrating the\npretrained Vision-language Models to enhance representation. However, due to\nthe gap between pretraining and SGG, direct inference of pretrained VLMs on SGG\nleads to severe bias, which stems from the imbalanced predicates distribution\nin the pretraining language set. To alleviate the bias, we introduce a novel LM\nEstimation to approximate the unattainable predicates distribution. Finally, we\nensemble the debiased VLMs with SGG models to enhance the representation, where\nwe design a certainty-aware indicator to score each sample and dynamically\nadjust the ensemble weights. Our training-free method effectively addresses the\npredicates bias in pretrained VLMs, enhances SGG's representation, and\nsignificantly improve the performance.\n","authors":["Yuxuan Wang","Xiaoyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12761v1","updated":"2025-01-22T09:54:43Z","published":"2025-01-22T09:54:43Z","title":"Modality Unified Attack for Omni-Modality Person Re-Identification","summary":" Deep learning based person re-identification (re-id) models have been widely\nemployed in surveillance systems. Recent studies have demonstrated that\nblack-box single-modality and cross-modality re-id models are vulnerable to\nadversarial examples (AEs), leaving the robustness of multi-modality re-id\nmodels unexplored. Due to the lack of knowledge about the specific type of\nmodel deployed in the target black-box surveillance system, we aim to generate\nmodality unified AEs for omni-modality (single-, cross- and multi-modality)\nre-id models. Specifically, we propose a novel Modality Unified Attack method\nto train modality-specific adversarial generators to generate AEs that\neffectively attack different omni-modality models. A multi-modality model is\nadopted as the surrogate model, wherein the features of each modality are\nperturbed by metric disruption loss before fusion. To collapse the common\nfeatures of omni-modality models, Cross Modality Simulated Disruption approach\nis introduced to mimic the cross-modality feature embeddings by intentionally\nfeeding images to non-corresponding modality-specific subnetworks of the\nsurrogate model. Moreover, Multi Modality Collaborative Disruption strategy is\ndevised to facilitate the attacker to comprehensively corrupt the informative\ncontent of person images by leveraging a multi modality feature collaborative\nmetric disruption loss. 
Extensive experiments show that our MUA method can\neffectively attack the omni-modality re-id models, achieving 55.9%, 24.4%,\n49.0% and 62.7% mean mAP Drop Rate, respectively.\n","authors":["Yuan Bian","Min Liu","Yunqi Yi","Xueping Wang","Yunfeng Ma","Yaonan Wang"],"pdf_url":"https://arxiv.org/pdf/2501.12761v1.pdf","comment":"9 pages,3 figures"},{"id":"http://arxiv.org/abs/2303.14676v3","updated":"2025-01-22T09:50:01Z","published":"2023-03-26T10:50:16Z","title":"PDPP: Projected Diffusion for Procedure Planning in Instructional Videos","summary":" In this paper, we study the problem of procedure planning in instructional\nvideos, which aims to make a plan (i.e. a sequence of actions) given the\ncurrent visual observation and the desired goal. Previous works cast this as a\nsequence modeling problem and leverage either intermediate visual observations\nor language instructions as supervision to make autoregressive planning,\nresulting in complex learning schemes and expensive annotation costs. To avoid\nintermediate supervision annotation and error accumulation caused by planning\nautoregressively, we propose a diffusion-based framework, coined as PDPP, to\ndirectly model the whole action sequence distribution with task label as\nsupervision instead. Our core idea is to treat procedure planning as a\ndistribution fitting problem under the given observations, thus transform the\nplanning problem to a sampling process from this distribution during inference.\nThe diffusion-based modeling approach also effectively addresses the\nuncertainty issue in procedure planning. Based on PDPP, we further apply joint\ntraining to our framework to generate plans with varying horizon lengths using\na single model and reduce the number of training parameters required. We\ninstantiate our PDPP with three popular diffusion models and investigate a\nseries of condition-introducing methods in our framework, including condition\nembeddings, MoEs, two-stage prediction and Classifier-Free Guidance strategy.\nFinally, we apply our PDPP to the Visual Planners for human Assistance problem\nwhich requires the goal specified in natural language rather than visual\nobservation. We conduct experiments on challenging datasets of different scales\nand our PDPP model achieves the state-of-the-art performance on multiple\nmetrics, even compared with those strongly-supervised counterparts. These\nresults further demonstrates the effectiveness and generalization ability of\nour model.\n","authors":["Hanlin Wang","Yilu Wu","Sheng Guo","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2303.14676v3.pdf","comment":"Accepted as a highlight paper at CVPR 2023. Extension accepted by\n TPAMI. Code and trained models are available at\n https://github.com/MCG-NJU/PDPP"},{"id":"http://arxiv.org/abs/2405.14318v2","updated":"2025-01-22T09:47:08Z","published":"2024-05-23T08:43:09Z","title":"Adaptive Retention & Correction for Continual Learning","summary":" Continual learning, also known as lifelong learning or incremental learning,\nrefers to the process by which a model learns from a stream of incoming data\nover time. A common problem in continual learning is the classification layer's\nbias towards the most recent task. Traditionally, methods have relied on\nincorporating data from past tasks during training to mitigate this issue.\nHowever, the recent shift in continual learning to memory-free environments has\nrendered these approaches infeasible. In this study, we propose a solution\nfocused on the testing phase. 
We first introduce a simple Out-of-Task Detection\nmethod, OTD, designed to accurately identify samples from past tasks during\ntesting. Leveraging OTD, we then propose: (1) an Adaptive Retention mechanism\nfor dynamically tuning the classifier layer on past task data; (2) an Adaptive\nCorrection mechanism for revising predictions when the model classifies data\nfrom previous tasks into classes from the current task. We name our approach\nAdaptive Retention & Correction (ARC). While designed for memory-free\nenvironments, ARC also proves effective in memory-based settings. Extensive\nexperiments show that our proposed method can be plugged in to virtually any\nexisting continual learning approach without requiring any modifications to its\ntraining procedure. Specifically, when integrated with state-of-the-art\napproaches, ARC achieves an average performance increase of 2.7% and 2.6% on\nthe CIFAR-100 and Imagenet-R datasets, respectively.\n","authors":["Haoran Chen","Micah Goldblum","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.14318v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2501.12751v1","updated":"2025-01-22T09:39:05Z","published":"2025-01-22T09:39:05Z","title":"Patent Figure Classification using Large Vision-language Models","summary":" Patent figure classification facilitates faceted search in patent retrieval\nsystems, enabling efficient prior art search. Existing approaches have explored\npatent figure classification for only a single aspect and for aspects with a\nlimited number of concepts. In recent years, large vision-language models\n(LVLMs) have shown tremendous performance across numerous computer vision\ndownstream tasks, however, they remain unexplored for patent figure\nclassification. Our work explores the efficacy of LVLMs in patent figure visual\nquestion answering (VQA) and classification, focusing on zero-shot and few-shot\nlearning scenarios. For this purpose, we introduce new datasets, PatFigVQA and\nPatFigCLS, for fine-tuning and evaluation regarding multiple aspects of patent\nfigures~(i.e., type, projection, patent class, and objects). For a\ncomputational-effective handling of a large number of classes using LVLM, we\npropose a novel tournament-style classification strategy that leverages a\nseries of multiple-choice questions. Experimental results and comparisons of\nmultiple classification approaches based on LVLMs and Convolutional Neural\nNetworks (CNNs) in few-shot settings show the feasibility of the proposed\napproaches.\n","authors":["Sushil Awale","Eric Müller-Budack","Ralph Ewerth"],"pdf_url":"https://arxiv.org/pdf/2501.12751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12736v1","updated":"2025-01-22T09:12:16Z","published":"2025-01-22T09:12:16Z","title":"Bad-PFL: Exploring Backdoor Attacks against Personalized Federated\n Learning","summary":" Data heterogeneity and backdoor attacks rank among the most significant\nchallenges facing federated learning (FL). For data heterogeneity, personalized\nfederated learning (PFL) enables each client to maintain a private personalized\nmodel to cater to client-specific knowledge. Meanwhile, vanilla FL has proven\nvulnerable to backdoor attacks. However, recent advancements in PFL community\nhave demonstrated a potential immunity against such attacks. This paper\nexplores this intersection further, revealing that existing federated backdoor\nattacks fail in PFL because backdoors about manually designed triggers struggle\nto survive in personalized models. 
To tackle this, we design Bad-PFL, which\nemploys features from natural data as our trigger. As long as the model is\ntrained on natural data, it inevitably embeds the backdoor associated with our\ntrigger, ensuring its longevity in personalized models. Moreover, our trigger\nundergoes mutual reinforcement training with the model, further solidifying the\nbackdoor's durability and enhancing attack effectiveness. The large-scale\nexperiments across three benchmark datasets demonstrate the superior\nperformance of our attack against various PFL methods, even when equipped with\nstate-of-the-art defense mechanisms.\n","authors":["Mingyuan Fan","Zhanyi Hu","Fuyi Wang","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2501.12736v1.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2205.10003v4","updated":"2025-01-22T09:06:08Z","published":"2022-05-20T07:40:09Z","title":"InDistill: Information flow-preserving knowledge distillation for model\n compression","summary":" In this paper, we introduce InDistill, a method that serves as a warmup stage\nfor enhancing Knowledge Distillation (KD) effectiveness. InDistill focuses on\ntransferring critical information flow paths from a heavyweight teacher to a\nlightweight student. This is achieved via a training scheme based on curriculum\nlearning that considers the distillation difficulty of each layer and the\ncritical learning periods when the information flow paths are established. This\nprocedure can lead to a student model that is better prepared to learn from the\nteacher. To ensure the applicability of InDistill across a wide range of\nteacher-student pairs, we also incorporate a pruning operation when there is a\ndiscrepancy in the width of the teacher and student layers. This pruning\noperation reduces the width of the teacher's intermediate layers to match those\nof the student, allowing direct distillation without the need for an encoding\nstage. The proposed method is extensively evaluated using various pairs of\nteacher-student architectures on CIFAR-10, CIFAR-100, and ImageNet datasets\ndemonstrating that preserving the information flow paths consistently increases\nthe performance of the baseline KD approaches on both classification and\nretrieval settings. The code is available at\nhttps://github.com/gsarridis/InDistill.\n","authors":["Ioannis Sarridis","Christos Koutlis","Giorgos Kordopatis-Zilos","Ioannis Kompatsiaris","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2205.10003v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16831v3","updated":"2025-01-22T08:45:56Z","published":"2024-03-25T14:57:18Z","title":"UrbanVLP: Multi-Granularity Vision-Language Pretraining for Urban\n Socioeconomic Indicator Prediction","summary":" Urban socioeconomic indicator prediction aims to infer various metrics\nrelated to sustainable development in diverse urban landscapes using\ndata-driven methods. However, prevalent pretrained models, particularly those\nreliant on satellite imagery, face dual challenges. Firstly, concentrating\nsolely on macro-level patterns from satellite data may introduce bias, lacking\nnuanced details at micro levels, such as architectural details at a place.\nSecondly, the text generated by the precursor work UrbanCLIP, which fully\nutilizes the extensive knowledge of LLMs, frequently exhibits issues such as\nhallucination and homogenization, resulting in a lack of reliable quality. In\nresponse to these issues, we devise a novel framework entitled UrbanVLP based\non Vision-Language Pretraining. 
Our UrbanVLP seamlessly integrates\nmulti-granularity information from both macro (satellite) and micro\n(street-view) levels, overcoming the limitations of prior pretrained models.\nMoreover, it introduces automatic text generation and calibration, providing a\nrobust guarantee for producing high-quality text descriptions of urban imagery.\nRigorous experiments conducted across six socioeconomic indicator prediction\ntasks underscore its superior performance.\n","authors":["Xixuan Hao","Wei Chen","Yibo Yan","Siru Zhong","Kun Wang","Qingsong Wen","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2403.16831v3.pdf","comment":"Accepted as a full paper by AAAI'25 - AI for Social Impact Track"},{"id":"http://arxiv.org/abs/2501.10074v2","updated":"2025-01-22T08:36:33Z","published":"2025-01-17T09:46:27Z","title":"SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and\n Chain-of-Thought for Embodied Task Planning","summary":" Spatial reasoning is an essential problem in embodied AI research. Efforts to\nenhance spatial reasoning abilities through supplementary spatial data and\nfine-tuning have proven limited and ineffective when addressing complex\nembodied tasks, largely due to their dependence on language-based outputs.\nWhile some approaches have introduced a point-based action space to mitigate\nthis issue, they fall short in managing more intricate tasks within complex\nenvironments. This deficiency arises from their failure to fully exploit the\ninherent thinking and reasoning capabilities that are fundamental strengths of\nVision-Language Models (VLMs). To address these limitations, we propose a novel\napproach named SpatialCoT, specifically designed to bolster the spatial\nreasoning capabilities of VLMs. Our approach comprises two stages: spatial\ncoordinate bi-directional alignment, which aligns vision-language inputs with\nspatial coordinates, and chain-of-thought spatial grounding, which harnesses\nthe reasoning capabilities of language models for advanced spatial reasoning.\nWe evaluate SpatialCoT on challenging navigation and manipulation tasks, both\nin simulation and real-world settings. Experimental results demonstrate that\nour method significantly outperforms previous state-of-the-art approaches in\nboth tasks.\n","authors":["Yuecheng Liu","Dafeng Chi","Shiguang Wu","Zhanguang Zhang","Yaochen Hu","Lingfeng Zhang","Yingxue Zhang","Shuang Wu","Tongtong Cao","Guowei Huang","Helong Huang","Guangjian Tian","Weichao Qiu","Xingyue Quan","Jianye Hao","Yuzheng Zhuang"],"pdf_url":"https://arxiv.org/pdf/2501.10074v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2501.12697v1","updated":"2025-01-22T08:14:11Z","published":"2025-01-22T08:14:11Z","title":"Combining Knowledge Graph and LLMs for Enhanced Zero-shot Visual\n Question Answering","summary":" Zero-shot visual question answering (ZS-VQA), an emerged critical research\narea, intends to answer visual questions without providing training samples.\nExisting research in ZS-VQA has proposed to leverage knowledge graphs or large\nlanguage models (LLMs), respectively, as external information sources to help\nVQA model comprehend images and questions. However, LLMs often struggle in\naccurately interpreting specific question meanings. Meanwhile, although\nknowledge graph has rich entity relationships, it is challenging to effectively\nconnect entities to individual image content for visual question answers. 
In\nthis paper, we propose a novel design to combine knowledge graph and LLMs for\nzero-shot visual question answer. Our approach uses LLMs' powerful\nunderstanding capabilities to accurately interpret image content through a\nstrategic question search mechanism. Meanwhile, the knowledge graph is used to\nexpand and connect users' queries to the image content for better visual\nquestion answering. An optimization algorithm is further used to determine the\noptimal weights for the loss functions derived from different information\nsources, towards a globally optimal set of candidate answers. Experimental\nresults on two benchmark datasets demonstrate that our model achieves\nstate-of-the-art (SOTA) performance. Both source code and benchmark data will\nbe released for public access.\n","authors":["Qian Tao","Xiaoyang Fan","Yong Xu","Xingquan Zhu","Yufei Tang"],"pdf_url":"https://arxiv.org/pdf/2501.12697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03189v3","updated":"2025-01-22T07:58:59Z","published":"2024-10-04T07:02:13Z","title":"Generalizable Prompt Tuning for Vision-Language Models","summary":" Prompt tuning for vision-language models such as CLIP involves optimizing the\ntext prompts used to generate image-text pairs for specific downstream tasks.\nWhile hand-crafted or template-based prompts are generally applicable to a\nwider range of unseen classes, they tend to perform poorly in downstream tasks\n(i.e., seen classes). Learnable soft prompts, on the other hand, often perform\nwell in downstream tasks but lack generalizability. Additionally, prior\nresearch has predominantly concentrated on the textual modality, with very few\nstudies attempting to explore the prompt's generalization potential from the\nvisual modality. Keeping these limitations in mind, we investigate how to\nprompt tuning to obtain both a competitive downstream performance and\ngeneralization. The study shows that by treating soft and hand-crafted prompts\nas dual views of the textual modality, and maximizing their mutual information,\nwe can better ensemble task-specific and general semantic information.\nMoreover, to generate more expressive prompts, the study introduces a\nclass-wise augmentation from the visual modality, resulting in significant\nrobustness to a wider range of unseen classes. Extensive evaluations on several\nbenchmarks report that the proposed approach achieves competitive results in\nterms of both task-specific performance and general abilities.\n","authors":["Qian Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.03189v3.pdf","comment":"in progress"},{"id":"http://arxiv.org/abs/2403.08479v2","updated":"2025-01-22T07:47:46Z","published":"2024-03-13T12:46:36Z","title":"MD-Dose: A diffusion model based on the Mamba for radiation dose\n prediction","summary":" Radiation therapy is crucial in cancer treatment. Experienced experts\ntypically iteratively generate high-quality dose distribution maps, forming the\nbasis for excellent radiation therapy plans. Therefore, automated prediction of\ndose distribution maps is significant in expediting the treatment process and\nproviding a better starting point for developing radiation therapy plans. With\nthe remarkable results of diffusion models in predicting high-frequency regions\nof dose distribution maps, dose prediction methods based on diffusion models\nhave been extensively studied. However, existing methods mainly utilize CNNs or\nTransformers as denoising networks. 
CNNs lack the capture of global receptive\nfields, resulting in suboptimal prediction performance. Transformers excel in\nglobal modeling but face quadratic complexity with image size, resulting in\nsignificant computational overhead. To tackle these challenges, we introduce a\nnovel diffusion model, MD-Dose, based on the Mamba architecture for predicting\nradiation therapy dose distribution in thoracic cancer patients. In the forward\nprocess, MD-Dose adds Gaussian noise to dose distribution maps to obtain pure\nnoise images. In the backward process, MD-Dose utilizes a noise predictor based\non the Mamba to predict the noise, ultimately outputting the dose distribution\nmaps. Furthermore, We develop a Mamba encoder to extract structural information\nand integrate it into the noise predictor for localizing dose regions in the\nplanning target volume (PTV) and organs at risk (OARs). Through extensive\nexperiments on a dataset of 300 thoracic tumor patients, we showcase the\nsuperiority of MD-Dose in various metrics and time consumption.\n","authors":["Linjie Fu","Xia Li","Xiuding Cai","Yingkai Wang","Xueyao Wang","Yali Shen","Yu Yao"],"pdf_url":"https://arxiv.org/pdf/2403.08479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10232v2","updated":"2025-01-22T07:00:51Z","published":"2024-01-18T18:59:58Z","title":"ParaHome: Parameterizing Everyday Home Activities Towards 3D Generative\n Modeling of Human-Object Interactions","summary":" To enable machines to understand the way humans interact with the physical\nworld in daily life, 3D interaction signals should be captured in natural\nsettings, allowing people to engage with multiple objects in a range of\nsequential and casual manipulations. To achieve this goal, we introduce our\nParaHome system designed to capture dynamic 3D movements of humans and objects\nwithin a common home environment. Our system features a multi-view setup with\n70 synchronized RGB cameras, along with wearable motion capture devices\nincluding an IMU-based body suit and hand motion capture gloves. By leveraging\nthe ParaHome system, we collect a new human-object interaction dataset,\nincluding 486 minutes of sequences across 207 captures with 38 participants,\noffering advancements with three key aspects: (1) capturing body motion and\ndexterous hand manipulation motion alongside multiple objects within a\ncontextual home environment; (2) encompassing sequential and concurrent\nmanipulations paired with text descriptions; and (3) including articulated\nobjects with multiple parts represented by 3D parameterized models. We present\ndetailed design justifications for our system, and perform key generative\nmodeling experiments to demonstrate the potential of our dataset.\n","authors":["Jeonghwan Kim","Jisoo Kim","Jeonghyeon Na","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2401.10232v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12681v1","updated":"2025-01-22T06:59:46Z","published":"2025-01-22T06:59:46Z","title":"Can masking background and object reduce static bias for zero-shot\n action recognition?","summary":" In this paper, we address the issue of static bias in zero-shot action\nrecognition. Action recognition models need to represent the action itself, not\nthe appearance. However, some fully-supervised works show that models often\nrely on static appearances, such as the background and objects, rather than\nhuman actions. This issue, known as static bias, has not been investigated for\nzero-shot. 
Although CLIP-based zero-shot models are now common, it remains\nunclear if they sufficiently focus on human actions, as CLIP primarily captures\nappearance features related to languages. In this paper, we investigate the\ninfluence of static bias in zero-shot action recognition with CLIP-based\nmodels. Our approach involves masking backgrounds, objects, and people\ndifferently during training and validation. Experiments with masking background\nshow that models depend on background bias as their performance decreases for\nKinetics400. However, for Mimetics, which has a weak background bias, masking\nthe background leads to improved performance even if the background is masked\nduring validation. Furthermore, masking both the background and objects in\ndifferent colors improves performance for SSv2, which has a strong object bias.\nThese results suggest that masking the background or objects during training\nprevents models from overly depending on static bias and makes them focus more\non human action.\n","authors":["Takumi Fukuzawa","Kensho Hara","Hirokatsu Kataoka","Toru Tamaki"],"pdf_url":"https://arxiv.org/pdf/2501.12681v1.pdf","comment":"In proc. of MMM2025"},{"id":"http://arxiv.org/abs/2407.12317v3","updated":"2025-01-22T06:32:02Z","published":"2024-07-17T05:02:17Z","title":"Out of Length Text Recognition with Sub-String Matching","summary":" Scene Text Recognition (STR) methods have demonstrated robust performance in\nword-level text recognition. However, in real applications the text image is\nsometimes long due to detected with multiple horizontal words. It triggers the\nrequirement to build long text recognition models from readily available short\n(i.e., word-level) text datasets, which has been less studied previously. In\nthis paper, we term this task Out of Length (OOL) text recognition. We\nestablish the first Long Text Benchmark (LTB) to facilitate the assessment of\ndifferent methods in long text recognition. Meanwhile, we propose a novel\nmethod called OOL Text Recognition with sub-String Matching (SMTR). SMTR\ncomprises two cross-attention-based modules: one encodes a sub-string\ncontaining multiple characters into next and previous queries, and the other\nemploys the queries to attend to the image features, matching the sub-string\nand simultaneously recognizing its next and previous character. SMTR can\nrecognize text of arbitrary length by iterating the process above. To avoid\nbeing trapped in recognizing highly similar sub-strings, we introduce a\nregularization training to compel SMTR to effectively discover subtle\ndifferences between similar sub-strings for precise matching. In addition, we\npropose an inference augmentation strategy to alleviate confusion caused by\nidentical sub-strings in the same text and improve the overall recognition\nefficiency. Extensive experimental results reveal that SMTR, even when trained\nexclusively on short text, outperforms existing methods in public short text\nbenchmarks and exhibits a clear advantage on LTB. 
Code:\nhttps://github.com/Topdu/OpenOCR.\n","authors":["Yongkun Du","Zhineng Chen","Caiyan Jia","Xieping Gao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.12317v3.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.12666v1","updated":"2025-01-22T06:03:16Z","published":"2025-01-22T06:03:16Z","title":"Explicit Eigenvalue Regularization Improves Sharpness-Aware Minimization","summary":" Sharpness-Aware Minimization (SAM) has attracted significant attention for\nits effectiveness in improving generalization across various tasks. However,\nits underlying principles remain poorly understood. In this work, we analyze\nSAM's training dynamics using the maximum eigenvalue of the Hessian as a\nmeasure of sharpness, and propose a third-order stochastic differential\nequation (SDE), which reveals that the dynamics are driven by a complex mixture\nof second- and third-order terms. We show that alignment between the\nperturbation vector and the top eigenvector is crucial for SAM's effectiveness\nin regularizing sharpness, but find that this alignment is often inadequate in\npractice, limiting SAM's efficiency. Building on these insights, we introduce\nEigen-SAM, an algorithm that explicitly aims to regularize the top Hessian\neigenvalue by aligning the perturbation vector with the leading eigenvector. We\nvalidate the effectiveness of our theory and the practical advantages of our\nproposed approach through comprehensive experiments. Code is available at\nhttps://github.com/RitianLuo/EigenSAM.\n","authors":["Haocheng Luo","Tuan Truong","Tung Pham","Mehrtash Harandi","Dinh Phung","Trung Le"],"pdf_url":"https://arxiv.org/pdf/2501.12666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12255v2","updated":"2025-01-22T06:00:26Z","published":"2025-01-21T16:23:05Z","title":"HAC++: Towards 100X Compression of 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel\nview synthesis, boasting rapid rendering speed with high fidelity. However, the\nsubstantial Gaussians and their associated attributes necessitate effective\ncompression techniques. Nevertheless, the sparse and unorganized nature of the\npoint cloud of Gaussians (or anchors in our paper) presents challenges for\ncompression. To achieve a compact size, we propose HAC++, which leverages the\nrelationships between unorganized anchors and a structured hash grid, utilizing\ntheir mutual information for context modeling. Additionally, HAC++ captures\nintra-anchor contextual relationships to further enhance compression\nperformance. To facilitate entropy coding, we utilize Gaussian distributions to\nprecisely estimate the probability of each quantized attribute, where an\nadaptive quantization module is proposed to enable high-precision quantization\nof these attributes for improved fidelity restoration. Moreover, we incorporate\nan adaptive masking strategy to eliminate invalid Gaussians and anchors.\nOverall, HAC++ achieves a remarkable size reduction of over 100X compared to\nvanilla 3DGS when averaged on all datasets, while simultaneously improving\nfidelity. It also delivers more than 20X size reduction compared to\nScaffold-GS. Our code is available at\nhttps://github.com/YihangChen-ee/HAC-plus.\n","authors":["Yihang Chen","Qianyi Wu","Weiyao Lin","Mehrtash Harandi","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2501.12255v2.pdf","comment":"Project Page: https://yihangchen-ee.github.io/project_hac++/ Code:\n https://github.com/YihangChen-ee/HAC-plus. 
This paper is a journal extension\n of HAC at arXiv:2403.14530 (ECCV 2024)"},{"id":"http://arxiv.org/abs/2403.15048v3","updated":"2025-01-22T05:46:56Z","published":"2024-03-22T09:13:09Z","title":"Make VLM Recognize Visual Hallucination on Cartoon Character Image with\n Pose Information","summary":" Leveraging large-scale Text-to-Image (TTI) models have become a common\ntechnique for generating exemplar or training dataset in the fields of image\nsynthesis, video editing, 3D reconstruction. However, semantic structural\nvisual hallucinations involving perceptually severe defects remain a concern,\nespecially in the domain of non-photorealistic rendering (NPR) such as cartoons\nand pixelization-style character. To detect these hallucinations in NPR, We\npropose a novel semantic structural hallucination detection system using\nVision-Language Model (VLM). Our approach is to leverage the emerging\ncapability of large language model, in-context learning which denotes that VLM\nhas seen some examples by user for specific downstream task, here hallucination\ndetection. Based on in-context learning, we introduce pose-aware in-context\nvisual learning (PA-ICVL) which improve the overall performance of VLM by\nfurther inputting visual data beyond prompts, RGB images and pose information.\nBy incorporating pose guidance, we enable VLMs to make more accurate decisions.\nExperimental results demonstrate significant improvements in identifying visual\nhallucinations compared to baseline methods relying solely on RGB images.\nWithin selected two VLMs, GPT-4v, Gemini pro vision, our proposed PA-ICVL\nimproves the hallucination detection with 50% to 78%, 57% to 80%, respectively.\nThis research advances a capability of TTI models toward real-world\napplications by mitigating visual hallucinations via in-context visual\nlearning, expanding their potential in non-photorealistic domains. In addition,\nit showcase how users can boost the downstream-specialized capability of open\nVLM by harnessing additional conditions. We collect synthetic\ncartoon-hallucination dataset with TTI models, this dataset and final tuned VLM\nwill be publicly available.\n","authors":["Bumsoo Kim","Wonseop Shin","Kyuchul Lee","Yonghoon Jung","Sanghyun Seo"],"pdf_url":"https://arxiv.org/pdf/2403.15048v3.pdf","comment":"Accepted at WACV 2025, Project page:\n https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/"},{"id":"http://arxiv.org/abs/2501.12390v2","updated":"2025-01-22T05:07:28Z","published":"2025-01-21T18:59:46Z","title":"GPS as a Control Signal for Image Generation","summary":" We show that the GPS tags contained in photo metadata provide a useful\ncontrol signal for image generation. We train GPS-to-image models and use them\nfor tasks that require a fine-grained understanding of how images vary within a\ncity. In particular, we train a diffusion model to generate images conditioned\non both GPS and text. The learned model generates images that capture the\ndistinctive appearance of different neighborhoods, parks, and landmarks. We\nalso extract 3D models from 2D GPS-to-image models through score distillation\nsampling, using GPS conditioning to constrain the appearance of the\nreconstruction from each viewpoint. Our evaluations suggest that our\nGPS-conditioned models successfully learn to generate images that vary based on\nlocation, and that GPS conditioning improves estimated 3D structure.\n","authors":["Chao Feng","Ziyang Chen","Aleksander Holynski","Alexei A. 
Efros","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2501.12390v2.pdf","comment":"Project page: https://cfeng16.github.io/gps-gen/"},{"id":"http://arxiv.org/abs/2501.12637v1","updated":"2025-01-22T04:53:12Z","published":"2025-01-22T04:53:12Z","title":"DWTNeRF: Boosting Few-shot Neural Radiance Fields via Discrete Wavelet\n Transform","summary":" Neural Radiance Fields (NeRF) has achieved superior performance in novel view\nsynthesis and 3D scene representation, but its practical applications are\nhindered by slow convergence and reliance on dense training views. To this end,\nwe present DWTNeRF, a unified framework based on Instant-NGP's fast-training\nhash encoding. It is coupled with regularization terms designed for few-shot\nNeRF, which operates on sparse training views. Our DWTNeRF includes a novel\nDiscrete Wavelet loss that allows explicit prioritization of low frequencies\ndirectly in the training objective, reducing few-shot NeRF's overfitting on\nhigh frequencies in earlier training stages. We additionally introduce a\nmodel-based approach, based on multi-head attention, that is compatible with\nINGP-based models, which are sensitive to architectural changes. On the 3-shot\nLLFF benchmark, DWTNeRF outperforms Vanilla NeRF by 15.07% in PSNR, 24.45% in\nSSIM and 36.30% in LPIPS. Our approach encourages a re-thinking of current\nfew-shot approaches for INGP-based models.\n","authors":["Hung Nguyen","Blark Runfa Li","Truong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.12637v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.12635v1","updated":"2025-01-22T04:43:21Z","published":"2025-01-22T04:43:21Z","title":"Multiple Queries with Multiple Keys: A Precise Prompt Matching Paradigm\n for Prompt-based Continual Learning","summary":" Continual learning requires machine learning models to continuously acquire\nnew knowledge in dynamic environments while avoiding the forgetting of previous\nknowledge. Prompt-based continual learning methods effectively address the\nissue of catastrophic forgetting through prompt expansion and selection.\nHowever, existing approaches often suffer from low accuracy in prompt\nselection, which can result in the model receiving biased knowledge and making\nbiased predictions. To address this issue, we propose the Multiple Queries with\nMultiple Keys (MQMK) prompt matching paradigm for precise prompt selection. The\ngoal of MQMK is to select the prompts whose training data distribution most\nclosely matches that of the test sample. Specifically, Multiple Queries enable\nprecise breadth search by introducing task-specific knowledge, while Multiple\nKeys perform deep search by representing the feature distribution of training\nsamples at a fine-grained level. Experiments show that MQMK enhances the prompt\nmatching rate by over 30% in challenging scenarios and achieves\nstate-of-the-art performance on three widely adopted continual learning\nbenchmarks. Once this paper is accepted, we will release the code.\n","authors":["Dunwei Tu","Huiyu Yi","Yuchi Wang","Baile Xu","Jian Zhao","Furao Shen"],"pdf_url":"https://arxiv.org/pdf/2501.12635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12632v1","updated":"2025-01-22T04:36:17Z","published":"2025-01-22T04:36:17Z","title":"TeD-Loc: Text Distillation for Weakly Supervised Object Localization","summary":" Weakly supervised object localization (WSOL) using classification models\ntrained with only image-class labels remains an important challenge in computer\nvision. 
Given their reliance on classification objectives, traditional WSOL\nmethods like class activation mapping focus on the most discriminative object\nparts, often missing the full spatial extent. In contrast, recent WSOL methods\nbased on vision-language models like CLIP require ground truth classes or\nexternal classifiers to produce a localization map, limiting their deployment\nin downstream tasks. Moreover, methods like GenPromp attempt to address these\nissues but introduce considerable complexity due to their reliance on\nconditional denoising processes and intricate prompt learning. This paper\nintroduces Text Distillation for Localization (TeD-Loc), an approach that\ndirectly distills knowledge from CLIP text embeddings into the model backbone\nand produces patch-level localization. Multiple instance learning of these\nimage patches allows for accurate localization and classification using one\nmodel without requiring external classifiers. Such integration of textual and\nvisual modalities addresses the longstanding challenge of achieving accurate\nlocalization and classification concurrently, as WSOL methods in the literature\ntypically converge at different epochs. Extensive experiments show that\nleveraging text embeddings and localization cues provides a cost-effective WSOL\nmodel. TeD-Loc improves Top-1 LOC accuracy over state-of-the-art models by\nabout 5% on both CUB and ILSVRC datasets, while significantly reducing\ncomputational complexity compared to GenPromp.\n","authors":["Shakeeb Murtaza","Soufiane Belharbi","Marco Pedersoli","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2501.12632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12104v2","updated":"2025-01-22T04:21:42Z","published":"2025-01-21T12:55:04Z","title":"Teacher Encoder-Student Decoder Denoising Guided Segmentation Network\n for Anomaly Detection","summary":" Visual anomaly detection is a highly challenging task, often categorized as a\none-class classification and segmentation problem. Recent studies have\ndemonstrated that the student-teacher (S-T) framework effectively addresses\nthis challenge. However, most S-T frameworks rely solely on pre-trained teacher\nnetworks to guide student networks in learning multi-scale similar features,\noverlooking the potential of the student networks to enhance learning through\nmulti-scale feature fusion. In this study, we propose a novel model named\nPFADSeg, which integrates a pre-trained teacher network, a denoising student\nnetwork with multi-scale feature fusion, and a guided anomaly segmentation\nnetwork into a unified framework. By adopting a unique teacher-encoder and\nstudent-decoder denoising mode, the model improves the student network's\nability to learn from teacher network features. Furthermore, an adaptive\nfeature fusion mechanism is introduced to train a self-supervised segmentation\nnetwork that synthesizes anomaly masks autonomously, significantly increasing\ndetection performance. 
Evaluated on the MVTec AD dataset, PFADSeg achieves\nstate-of-the-art results with an image-level AUC of 98.9%, a pixel-level mean\nprecision of 76.4%, and an instance-level mean precision of 78.7%.\n","authors":["Shixuan Song","Hao Chen","Shu Hu","Xin Wang","Jinrong Hu","Xi Wu"],"pdf_url":"https://arxiv.org/pdf/2501.12104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04410v2","updated":"2025-01-22T04:10:54Z","published":"2024-09-06T17:14:53Z","title":"Open-MAGVIT2: An Open-Source Project Toward Democratizing\n Auto-regressive Visual Generation","summary":" We present Open-MAGVIT2, a family of auto-regressive image generation models\nranging from 300M to 1.5B. The Open-MAGVIT2 project produces an open-source\nreplication of Google's MAGVIT-v2 tokenizer, a tokenizer with a super-large\ncodebook (i.e., $2^{18}$ codes), and achieves the state-of-the-art\nreconstruction performance (1.17 rFID) on ImageNet $256 \\times 256$.\nFurthermore, we explore its application in plain auto-regressive models and\nvalidate scalability properties. To assist auto-regressive models in predicting\nwith a super-large vocabulary, we factorize it into two sub-vocabulary of\ndifferent sizes by asymmetric token factorization, and further introduce \"next\nsub-token prediction\" to enhance sub-token interaction for better generation\nquality. We release all models and codes to foster innovation and creativity in\nthe field of auto-regressive visual generation.\n","authors":["Zhuoyan Luo","Fengyuan Shi","Yixiao Ge","Yujiu Yang","Limin Wang","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2409.04410v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23623v2","updated":"2025-01-22T04:10:31Z","published":"2024-10-31T04:20:47Z","title":"On Learning Multi-Modal Forgery Representation for Diffusion Generated\n Video Detection","summary":" Large numbers of synthesized videos from diffusion models pose threats to\ninformation security and authenticity, leading to an increasing demand for\ngenerated content detection. However, existing video-level detection algorithms\nprimarily focus on detecting facial forgeries and often fail to identify\ndiffusion-generated content with a diverse range of semantics. To advance the\nfield of video forensics, we propose an innovative algorithm named Multi-Modal\nDetection(MM-Det) for detecting diffusion-generated videos. MM-Det utilizes the\nprofound perceptual and comprehensive abilities of Large Multi-modal Models\n(LMMs) by generating a Multi-Modal Forgery Representation (MMFR) from LMM's\nmulti-modal space, enhancing its ability to detect unseen forgery content.\nBesides, MM-Det leverages an In-and-Across Frame Attention (IAFA) mechanism for\nfeature augmentation in the spatio-temporal domain. A dynamic fusion strategy\nhelps refine forgery representations for the fusion. Moreover, we construct a\ncomprehensive diffusion video dataset, called Diffusion Video Forensics (DVF),\nacross a wide range of forgery videos. MM-Det achieves state-of-the-art\nperformance in DVF, demonstrating the effectiveness of our algorithm. 
Both\nsource code and DVF are available at https://github.com/SparkleXFantasy/MM-Det.\n","authors":["Xiufeng Song","Xiao Guo","Jiache Zhang","Qirui Li","Lei Bai","Xiaoming Liu","Guangtao Zhai","Xiaohong Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23623v2.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.09365v5","updated":"2025-01-22T04:06:29Z","published":"2024-05-15T14:17:44Z","title":"SARATR-X: Toward Building A Foundation Model for SAR Target Recognition","summary":" Despite the remarkable progress in synthetic aperture radar automatic target\nrecognition (SAR ATR), recent efforts have concentrated on detecting and\nclassifying a specific category, e.g., vehicles, ships, airplanes, or\nbuildings. One of the fundamental limitations of the top-performing SAR ATR\nmethods is that the learning paradigm is supervised, task-specific,\nlimited-category, closed-world learning, which depends on massive amounts of\naccurately annotated samples that are expensively labeled by expert SAR\nanalysts and have limited generalization capability and scalability. In this\nwork, we make the first attempt towards building a foundation model for SAR\nATR, termed SARATR-X. SARATR-X learns generalizable representations via\nself-supervised learning (SSL) and provides a cornerstone for label-efficient\nmodel adaptation to generic SAR target detection and classification tasks.\nSpecifically, SARATR-X is trained on 0.18 M unlabelled SAR target samples,\nwhich are curated by combining contemporary benchmarks and constitute the\nlargest publicly available dataset till now. Considering the characteristics of\nSAR images, a backbone tailored for SAR ATR is carefully designed, and a\ntwo-step SSL method endowed with multi-scale gradient features was applied to\nensure the feature diversity and model scalability of SARATR-X. The\ncapabilities of SARATR-X are evaluated on classification under few-shot and\nrobustness settings and detection across various categories and scenes, and\nimpressive performance is achieved, often competitive with or even superior to\nprior fully supervised, semi-supervised, or self-supervised algorithms. Our\nSARATR-X and the curated dataset are released at\nhttps://github.com/waterdisappear/SARATR-X to foster research into foundation\nmodels for SAR image interpretation.\n","authors":["Weijie Li","Wei Yang","Yuenan Hou","Li Liu","Yongxiang Liu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2405.09365v5.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2210.13984v5","updated":"2025-01-22T04:02:30Z","published":"2022-10-24T07:43:59Z","title":"Inferring Past Human Actions in Homes with Abductive Reasoning","summary":" Abductive reasoning aims to make the most likely inference for a given set of\nincomplete observations. In this paper, we introduce \"Abductive Past Action\nInference\", a novel research task aimed at identifying the past actions\nperformed by individuals within homes to reach specific states captured in a\nsingle image, using abductive inference. The research explores three key\nabductive inference problems: past action set prediction, past action sequence\nprediction, and abductive past action verification. We introduce several models\ntailored for abductive past action inference, including a relational graph\nneural network, a relational bilinear pooling model, and a relational\ntransformer model. 
Notably, the newly proposed object-relational bilinear graph\nencoder-decoder (BiGED) model emerges as the most effective among all methods\nevaluated, demonstrating good proficiency in handling the intricacies of the\nAction Genome dataset. The contributions of this research significantly advance\nthe ability of deep learning models to reason about current scene evidence and\nmake highly plausible inferences about past human actions. This advancement\nenables a deeper understanding of events and behaviors, which can enhance\ndecision-making and improve system capabilities across various real-world\napplications such as Human-Robot Interaction and Elderly Care and Health\nMonitoring. Code and data available at https://github.com/LUNAProject22/AAR\n","authors":["Clement Tan","Chai Kiat Yeo","Cheston Tan","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2210.13984v5.pdf","comment":"15 pages, 8 figures, Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2405.08300v3","updated":"2025-01-22T03:19:36Z","published":"2024-05-14T03:50:07Z","title":"Vector-Symbolic Architecture for Event-Based Optical Flow","summary":" From a perspective of feature matching, optical flow estimation for event\ncameras involves identifying event correspondences by comparing feature\nsimilarity across accompanying event frames. In this work, we introduces an\neffective and robust high-dimensional (HD) feature descriptor for event frames,\nutilizing Vector Symbolic Architectures (VSA). The topological similarity among\nneighboring variables within VSA contributes to the enhanced representation\nsimilarity of feature descriptors for flow-matching points, while its\nstructured symbolic representation capacity facilitates feature fusion from\nboth event polarities and multiple spatial scales. Based on this HD feature\ndescriptor, we propose a novel feature matching framework for event-based\noptical flow, encompassing both model-based (VSA-Flow) and self-supervised\nlearning (VSA-SM) methods. In VSA-Flow, accurate optical flow estimation\nvalidates the effectiveness of HD feature descriptors. In VSA-SM, a novel\nsimilarity maximization method based on the HD feature descriptor is proposed\nto learn optical flow in a self-supervised way from events alone, eliminating\nthe need for auxiliary grayscale images. Evaluation results demonstrate that\nour VSA-based method achieves superior accuracy in comparison to both\nmodel-based and self-supervised learning methods on the DSEC benchmark, while\nremains competitive among both methods on the MVSEC benchmark. This\ncontribution marks a significant advancement in event-based optical flow within\nthe feature matching methodology.\n","authors":["Hongzhi You","Yijun Cao","Wei Yuan","Fanjun Wang","Ning Qiao","Yongjie Li"],"pdf_url":"https://arxiv.org/pdf/2405.08300v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11815v2","updated":"2025-01-22T03:17:01Z","published":"2025-01-21T01:45:56Z","title":"CogMorph: Cognitive Morphing Attacks for Text-to-Image Models","summary":" The development of text-to-image (T2I) generative models, that enable the\ncreation of high-quality synthetic images from textual prompts, has opened new\nfrontiers in creative design and content generation. 
However, this paper\nreveals a significant and previously unrecognized ethical risk inherent in this\ntechnology and introduces a novel method, termed the Cognitive Morphing Attack\n(CogMorph), which manipulates T2I models to generate images that retain the\noriginal core subjects but embeds toxic or harmful contextual elements. This\nnuanced manipulation exploits the cognitive principle that human perception of\nconcepts is shaped by the entire visual scene and its context, producing images\nthat amplify emotional harm far beyond attacks that merely preserve the\noriginal semantics. To address this, we first construct an imagery toxicity\ntaxonomy spanning 10 major and 48 sub-categories, aligned with human\ncognitive-perceptual dimensions, and further build a toxicity risk matrix\nresulting in 1,176 high-quality T2I toxic prompts. Based on this, our CogMorph\nfirst introduces Cognitive Toxicity Augmentation, which develops a cognitive\ntoxicity knowledge base with rich external toxic representations for humans\n(e.g., fine-grained visual features) that can be utilized to further guide the\noptimization of adversarial prompts. In addition, we present Contextual\nHierarchical Morphing, which hierarchically extracts critical parts of the\noriginal prompt (e.g., scenes, subjects, and body parts), and then iteratively\nretrieves and fuses toxic features to inject harmful contexts. Extensive\nexperiments on multiple open-sourced T2I models and black-box commercial APIs\n(e.g., DALLE-3) demonstrate the efficacy of CogMorph which significantly\noutperforms other baselines by large margins (+20.62% on average).\n","authors":["Zonglei Jing","Zonghao Ying","Le Wang","Siyuan Liang","Aishan Liu","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2501.11815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12604v1","updated":"2025-01-22T03:01:54Z","published":"2025-01-22T03:01:54Z","title":"Image Motion Blur Removal in the Temporal Dimension with Video Diffusion\n Models","summary":" Most motion deblurring algorithms rely on spatial-domain convolution models,\nwhich struggle with the complex, non-linear blur arising from camera shake and\nobject motion. In contrast, we propose a novel single-image deblurring approach\nthat treats motion blur as a temporal averaging phenomenon. Our core innovation\nlies in leveraging a pre-trained video diffusion transformer model to capture\ndiverse motion dynamics within a latent space. It sidesteps explicit kernel\nestimation and effectively accommodates diverse motion patterns. We implement\nthe algorithm within a diffusion-based inverse problem framework. Empirical\nresults on synthetic and real-world datasets demonstrate that our method\noutperforms existing techniques in deblurring complex motion blur scenarios.\nThis work paves the way for utilizing powerful video diffusion models to\naddress single-image deblurring challenges.\n","authors":["Wang Pang","Zhihao Zhan","Xiang Zhu","Yechao Bai"],"pdf_url":"https://arxiv.org/pdf/2501.12604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10891v2","updated":"2025-01-22T02:53:36Z","published":"2025-01-18T22:30:27Z","title":"OpenEarthMap-SAR: A Benchmark Synthetic Aperture Radar Dataset for\n Global High-Resolution Land Cover Mapping","summary":" High-resolution land cover mapping plays a crucial role in addressing a wide\nrange of global challenges, including urban planning, environmental monitoring,\ndisaster response, and sustainable development. 
However, creating accurate,\nlarge-scale land cover datasets remains a significant challenge due to the\ninherent complexities of geospatial data, such as diverse terrain, varying\nsensor modalities, and atmospheric conditions. Synthetic Aperture Radar (SAR)\nimagery, with its ability to penetrate clouds and capture data in all-weather,\nday-and-night conditions, offers unique advantages for land cover mapping.\nDespite these strengths, the lack of benchmark datasets tailored for SAR\nimagery has limited the development of robust models specifically designed for\nthis data modality. To bridge this gap and facilitate advancements in SAR-based\ngeospatial analysis, we introduce OpenEarthMap-SAR, a benchmark SAR dataset,\nfor global high-resolution land cover mapping. OpenEarthMap-SAR consists of 1.5\nmillion segments of 5033 aerial and satellite images with the size of\n1024$\\times$1024 pixels, covering 35 regions from Japan, France, and the USA,\nwith partially manually annotated and fully pseudo 8-class land cover labels at\na ground sampling distance of 0.15--0.5 m. We evaluated the performance of\nstate-of-the-art methods for semantic segmentation and present challenging\nproblem settings suitable for further technical development. The dataset also\nserves the official dataset for IEEE GRSS Data Fusion Contest Track I. The\ndataset has been made publicly available at\nhttps://zenodo.org/records/14622048.\n","authors":["Junshi Xia","Hongruixuan Chen","Clifford Broni-Bediako","Yimin Wei","Jian Song","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2501.10891v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.12596v1","updated":"2025-01-22T02:45:30Z","published":"2025-01-22T02:45:30Z","title":"Adapting OpenAI's CLIP Model for Few-Shot Image Inspection in\n Manufacturing Quality Control: An Expository Case Study with Multiple\n Application Examples","summary":" This expository paper introduces a simplified approach to image-based quality\ninspection in manufacturing using OpenAI's CLIP (Contrastive Language-Image\nPretraining) model adapted for few-shot learning. While CLIP has demonstrated\nimpressive capabilities in general computer vision tasks, its direct\napplication to manufacturing inspection presents challenges due to the domain\ngap between its training data and industrial applications. We evaluate CLIP's\neffectiveness through five case studies: metallic pan surface inspection, 3D\nprinting extrusion profile analysis, stochastic textured surface evaluation,\nautomotive assembly inspection, and microstructure image classification. Our\nresults show that CLIP can achieve high classification accuracy with relatively\nsmall learning sets (50-100 examples per class) for single-component and\ntexture-based applications. However, the performance degrades with complex\nmulti-component scenes. We provide a practical implementation framework that\nenables quality engineers to quickly assess CLIP's suitability for their\nspecific applications before pursuing more complex solutions. This work\nestablishes CLIP-based few-shot learning as an effective baseline approach that\nbalances implementation simplicity with robust performance, demonstrated in\nseveral manufacturing quality control applications.\n","authors":["Fadel M. Megahed","Ying-Ju Chen","Bianca Maria Colosimo","Marco Luigi Giuseppe Grasso","L. 
Allison Jones-Farmer","Sven Knoth","Hongyue Sun","Inez Zwetsloot"],"pdf_url":"https://arxiv.org/pdf/2501.12596v1.pdf","comment":"31 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.17425v5","updated":"2025-01-22T02:29:13Z","published":"2023-11-29T07:57:30Z","title":"SpeechAct: Towards Generating Whole-body Motion from Speech","summary":" This paper addresses the problem of generating whole-body motion from speech.\nDespite great successes, prior methods still struggle to produce reasonable and\ndiverse whole-body motions from speech. This is due to their reliance on\nsuboptimal representations and a lack of strategies for generating diverse\nresults. To address these challenges, we present a novel hybrid point\nrepresentation to achieve accurate and continuous motion generation, e.g.,\navoiding foot skating, and this representation can be transformed into an\neasy-to-use representation, i.e., SMPL-X body mesh, for many applications. To\ngenerate whole-body motion from speech, for facial motion, closely tied to the\naudio signal, we introduce an encoder-decoder architecture to achieve\ndeterministic outcomes. However, for the body and hands, which have weaker\nconnections to the audio signal, we aim to generate diverse yet reasonable\nmotions. To boost diversity in motion generation, we propose a contrastive\nmotion learning method to encourage the model to produce more distinctive\nrepresentations. Specifically, we design a robust VQ-VAE to learn a quantized\nmotion codebook using our hybrid representation. Then, we regress the motion\nrepresentation from the audio signal by a translation model employing our\ncontrastive motion learning method. Experimental results validate the superior\nperformance and the correctness of our model. The project page is available for\nresearch purposes at http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct.\n","authors":["Jinsong Zhang","Minjie Zhu","Yuxiang Zhang","Yebin Liu","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2311.17425v5.pdf","comment":"Accepted by TVCG"},{"id":"http://arxiv.org/abs/2501.09905v2","updated":"2025-01-22T01:48:31Z","published":"2025-01-17T01:32:18Z","title":"SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon\n Visuomotor Learning","summary":" We present a low-cost legged mobile manipulation system that solves\nlong-horizon real-world tasks, trained by reinforcement learning purely in\nsimulation. This system is made possible by 1) a hierarchical design of a\nhigh-level policy for visual-mobile manipulation following instructions and a\nlow-level policy for quadruped movement and limb control, 2) a progressive\nexploration and learning approach that leverages privileged task decomposition\ninformation to train the teacher policy for long-horizon tasks, which will\nguide an imitation-based student policy for efficient training of the\nhigh-level visuomotor policy, and 3) a suite of techniques for minimizing\nsim-to-real gaps.\n In contrast to previous approaches that use high-end equipment, our system\ndemonstrates effective performance with more accessible hardware -\nspecifically, a Unitree Go1 quadruped, a WidowX250S arm, and a single\nwrist-mounted RGB camera - despite the increased challenges of sim-to-real\ntransfer. When fully trained in simulation, a single policy autonomously solves\nlong-horizon tasks such as search, move, grasp, and drop-into, achieving nearly\n80% success. 
This performance is comparable to that of expert human\nteleoperation on the same tasks but operates in a more efficient way, at 1.5\ntimes the speed of human expert. The sim-to-real transfer is fluid across\ndiverse indoor and outdoor scenes under varying lighting conditions. Finally,\nwe discuss the key techniques that enable the entire pipeline, including\nefficient RL training and sim-to-real, to work effectively for legged mobile\nmanipulation, and present their ablation results.\n","authors":["Haichao Zhang","Haonan Yu","Le Zhao","Andrew Choi","Qinxun Bai","Break Yang","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2501.09905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17902v3","updated":"2025-01-22T01:08:28Z","published":"2024-03-26T17:43:15Z","title":"Serpent: Scalable and Efficient Image Restoration via Multi-scale\n Structured State Space Models","summary":" The landscape of computational building blocks of efficient image restoration\narchitectures is dominated by a combination of convolutional processing and\nvarious attention mechanisms. However, convolutional filters, while efficient,\nare inherently local and therefore struggle with modeling long-range\ndependencies in images. In contrast, attention excels at capturing global\ninteractions between arbitrary image regions, but suffers from a quadratic cost\nin image dimension. In this work, we propose Serpent, an efficient architecture\nfor high-resolution image restoration that combines recent advances in state\nspace models (SSMs) with multi-scale signal processing in its core\ncomputational block. SSMs, originally introduced for sequence modeling, can\nmaintain a global receptive field with a favorable linear scaling in input\nsize. We propose a novel hierarchical architecture inspired by traditional\nsignal processing principles, that converts the input image into a collection\nof sequences and processes them in a multi-scale fashion. Our experimental\nresults demonstrate that Serpent can achieve reconstruction quality on par with\nstate-of-the-art techniques, while requiring orders of magnitude less compute\n(up to $150$ fold reduction in FLOPS) and a factor of up to $5\\times$ less GPU\nmemory while maintaining a compact model size. The efficiency gains achieved by\nSerpent are especially notable at high image resolutions.\n","authors":["Mohammad Shahab Sepehri","Zalan Fabian","Mahdi Soltanolkotabi"],"pdf_url":"https://arxiv.org/pdf/2403.17902v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12553v1","updated":"2025-01-22T00:17:08Z","published":"2025-01-22T00:17:08Z","title":"ViDDAR: Vision Language Model-Based Task-Detrimental Content Detection\n for Augmented Reality","summary":" In Augmented Reality (AR), virtual content enhances user experience by\nproviding additional information. However, improperly positioned or designed\nvirtual content can be detrimental to task performance, as it can impair users'\nability to accurately interpret real-world information. In this paper we\nexamine two types of task-detrimental virtual content: obstruction attacks, in\nwhich virtual content prevents users from seeing real-world objects, and\ninformation manipulation attacks, in which virtual content interferes with\nusers' ability to accurately interpret real-world information. We provide a\nmathematical framework to characterize these attacks and create a custom\nopen-source dataset for attack evaluation. 
To address these attacks, we\nintroduce ViDDAR (Vision language model-based Task-Detrimental content Detector\nfor Augmented Reality), a comprehensive full-reference system that leverages\nVision Language Models (VLMs) and advanced deep learning techniques to monitor\nand evaluate virtual content in AR environments, employing a user-edge-cloud\narchitecture to balance performance with low latency. To the best of our\nknowledge, ViDDAR is the first system to employ VLMs for detecting\ntask-detrimental content in AR settings. Our evaluation results demonstrate\nthat ViDDAR effectively understands complex scenes and detects task-detrimental\ncontent, achieving up to 92.15% obstruction detection accuracy with a detection\nlatency of 533 ms, and an 82.46% information manipulation content detection\naccuracy with a latency of 9.62 s.\n","authors":["Yanming Xiu","Tim Scargill","Maria Gorlatova"],"pdf_url":"https://arxiv.org/pdf/2501.12553v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2412.03378v2","updated":"2025-01-22T00:06:08Z","published":"2024-12-04T15:05:43Z","title":"Volumetrically Consistent 3D Gaussian Rasterization","summary":" Recently, 3D Gaussian Splatting (3DGS) has enabled photorealistic view\nsynthesis at high inference speeds. However, its splatting-based rendering\nmodel makes several approximations to the rendering equation, reducing physical\naccuracy. We show that splatting and its approximations are unnecessary, even\nwithin a rasterizer; we instead volumetrically integrate 3D Gaussians directly\nto compute the transmittance across them analytically. We use this analytic\ntransmittance to derive more physically-accurate alpha values than 3DGS, which\ncan directly be used within their framework. The result is a method that more\nclosely follows the volume rendering equation (similar to ray-tracing) while\nenjoying the speed benefits of rasterization. Our method represents opaque\nsurfaces with higher accuracy and fewer points than 3DGS. This enables it to\noutperform 3DGS for view synthesis (measured in SSIM and LPIPS). Being\nvolumetrically consistent also enables our method to work out of the box for\ntomography. We match the state-of-the-art 3DGS-based tomography method with\nfewer points.\n","authors":["Chinmay Talegaonkar","Yash Belhe","Ravi Ramamoorthi","Nicholas Antipa"],"pdf_url":"https://arxiv.org/pdf/2412.03378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08923v2","updated":"2025-01-22T23:58:03Z","published":"2024-11-12T08:14:54Z","title":"Aligning Visual Contrastive learning models via Preference Optimization","summary":" Contrastive learning models have demonstrated impressive abilities to capture\nsemantic similarities by aligning representations in the embedding space.\nHowever, their performance can be limited by the quality of the training data\nand its inherent biases. While Reinforcement Learning from Human Feedback\n(RLHF) and Direct Preference Optimization (DPO) have been applied to generative\nmodels to align them with human preferences, their use in contrastive learning\nhas yet to be explored. This paper introduces a novel method for training\ncontrastive learning models using Preference Optimization (PO) to break down\ncomplex concepts. Our method systematically aligns model behavior with desired\npreferences, enhancing performance on the targeted task. In particular, we\nfocus on enhancing model robustness against typographic attacks, commonly seen\nin contrastive models like CLIP. 
We further apply our method to disentangle\ngender understanding and mitigate gender biases, offering a more nuanced\ncontrol over these sensitive attributes. Our experiments demonstrate that\nmodels trained using PO outperform standard contrastive learning techniques\nwhile retaining their ability to handle adversarial challenges and maintain\naccuracy on other downstream tasks. This makes our method well-suited for tasks\nrequiring fairness, robustness, and alignment with specific preferences. We\nevaluate our method on several vision-language tasks, tackling challenges such\nas typographic attacks. Additionally, we explore the model's ability to\ndisentangle gender concepts and mitigate gender bias, showcasing the\nversatility of our approach.\n","authors":["Amirabbas Afzali","Borna Khodabandeh","Ali Rasekh","Mahyar JafariNodeh","Sepehr kazemi","Simon Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2411.08923v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13277v1","updated":"2025-01-22T23:56:37Z","published":"2025-01-22T23:56:37Z","title":"MEDFORM: A Foundation Model for Contrastive Learning of CT Imaging and\n Clinical Numeric Data in Multi-Cancer Analysis","summary":" Computed tomography (CT) and clinical numeric data are essential modalities\nfor cancer evaluation, but building large-scale multimodal training datasets\nfor developing medical foundation models remains challenging due to the\nstructural complexity of multi-slice CT data and high cost of expert\nannotation. In this study, we propose MEDFORM, a multimodal pre-training\nstrategy that guides CT image representation learning using complementary\ninformation from clinical data for medical foundation model development.\nMEDFORM efficiently processes CT slice through multiple instance learning (MIL)\nand adopts a dual pre-training strategy: first pretraining the CT slice feature\nextractor using SimCLR-based self-supervised learning, then aligning CT and\nclinical modalities through cross-modal contrastive learning. Our model was\npre-trained on three different cancer types: lung cancer (141,171 slices),\nbreast cancer (8,100 slices), colorectal cancer (10,393 slices). The\nexperimental results demonstrated that this dual pre-training strategy improves\ncancer classification performance and maintains robust performance in few-shot\nlearning scenarios. Code available at\nhttps://github.com/DigitalHealthcareLab/25MultiModalFoundationModel.git\n","authors":["Daeun Jung","Jaehyeok Jang","Sooyoung Jang","Yu Rang Park"],"pdf_url":"https://arxiv.org/pdf/2501.13277v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2406.14862v5","updated":"2025-01-22T22:56:04Z","published":"2024-06-21T04:39:03Z","title":"LatentExplainer: Explaining Latent Representations in Deep Generative\n Models with Multimodal Large Language Models","summary":" Deep generative models like VAEs and diffusion models have advanced various\ngeneration tasks by leveraging latent variables to learn data distributions and\ngenerate high-quality samples. Despite the field of explainable AI making\nstrides in interpreting machine learning models, understanding latent variables\nin generative models remains challenging. 
This paper introduces\n\\textit{LatentExplainer}, a framework for automatically generating semantically\nmeaningful explanations of latent variables in deep generative models.\n\\textit{LatentExplainer} tackles three main challenges: inferring the meaning\nof latent variables, aligning explanations with inductive biases, and handling\nvarying degrees of explainability. Our approach perturbs latent variables,\ninterpreting changes in generated data, and uses multi-modal large language\nmodels (MLLMs) to produce human-understandable explanations. We evaluate our\nproposed method on several real-world and synthetic datasets, and the results\ndemonstrate superior performance in generating high-quality explanations for\nlatent variables. The results highlight the effectiveness of incorporating\ninductive biases and uncertainty quantification, significantly enhancing model\ninterpretability.\n","authors":["Mengdan Zhu","Raasikh Kanjiani","Jiahui Lu","Andrew Choi","Qirui Ye","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.14862v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12057v2","updated":"2025-01-22T22:52:40Z","published":"2025-01-21T11:27:54Z","title":"Unified 3D MRI Representations via Sequence-Invariant Contrastive\n Learning","summary":" Self-supervised deep learning has accelerated 2D natural image analysis but\nremains difficult to translate into 3D MRI, where data are scarce and\npre-trained 2D backbones cannot capture volumetric context. We present a\nsequence-invariant self-supervised framework leveraging quantitative MRI\n(qMRI). By simulating multiple MRI contrasts from a single 3D qMRI scan and\nenforcing consistent representations across these contrasts, we learn\nanatomy-centric rather than sequence-specific features. This yields a robust 3D\nencoder that performs strongly across varied tasks and protocols. Experiments\non healthy brain segmentation (IXI), stroke lesion segmentation (ARC), and MRI\ndenoising show significant gains over baseline SSL approaches, especially in\nlow-data settings (up to +8.3% Dice, +4.2 dB PSNR). Our model also generalises\neffectively to unseen sites, demonstrating potential for more scalable and\nclinically reliable volumetric analysis. All code and trained models are\npublicly available.\n","authors":["Liam Chalcroft","Jenny Crinion","Cathy J. Price","John Ashburner"],"pdf_url":"https://arxiv.org/pdf/2501.12057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15231v2","updated":"2025-01-22T22:44:26Z","published":"2024-08-27T17:48:29Z","title":"DCT-CryptoNets: Scaling Private Inference in the Frequency Domain","summary":" The convergence of fully homomorphic encryption (FHE) and machine learning\noffers unprecedented opportunities for private inference of sensitive data. FHE\nenables computation directly on encrypted data, safeguarding the entire machine\nlearning pipeline, including data and model confidentiality. However, existing\nFHE-based implementations for deep neural networks face significant challenges\nin computational cost, latency, and scalability, limiting their practical\ndeployment. This paper introduces DCT-CryptoNets, a novel approach that\noperates directly in the frequency-domain to reduce the burden of\ncomputationally expensive non-linear activations and homomorphic bootstrap\noperations during private inference. 
It does so by utilizing the discrete\ncosine transform (DCT), commonly employed in JPEG encoding, which has inherent\ncompatibility with remote computing services where images are generally stored\nand transmitted in this encoded format. DCT-CryptoNets demonstrates a\nsubstantial latency reductions of up to 5.3$\\times$ compared to prior work on\nbenchmark image classification tasks. Notably, it demonstrates inference on the\nImageNet dataset within 2.5 hours (down from 12.5 hours on equivalent 96-thread\ncompute resources). Furthermore, by learning perceptually salient low-frequency\ninformation DCT-CryptoNets improves the reliability of encrypted predictions\ncompared to RGB-based networks by reducing error accumulating homomorphic\nbootstrap operations. DCT-CryptoNets also demonstrates superior scalability to\nRGB-based networks by further reducing computational cost as image size\nincreases. This study demonstrates a promising avenue for achieving efficient\nand practical private inference of deep learning models on high resolution\nimages seen in real-world applications.\n","authors":["Arjun Roy","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2408.15231v2.pdf","comment":"ICLR 2025; 10 pages content, 5 pages appendix, 4 figures, 9 tables;\n Code @ https://github.com/ar-roy/dct-cryptonets"},{"id":"http://arxiv.org/abs/2307.02698v5","updated":"2025-01-22T22:16:18Z","published":"2023-07-06T00:07:32Z","title":"Dequantization and Color Transfer with Diffusion Models","summary":" We demonstrate an image dequantizing diffusion model that enables novel edits\non natural images. We propose operating on quantized images because they offer\neasy abstraction for patch-based edits and palette transfer. In particular, we\nshow that color palettes can make the output of the diffusion model easier to\ncontrol and interpret. We first establish that existing image restoration\nmethods are not sufficient, such as JPEG noise reduction models. We then\ndemonstrate that our model can generate natural images that respect the color\npalette the user asked for. For palette transfer, we propose a method based on\nweighted bipartite matching. We then show that our model generates plausible\nimages even after extreme palette transfers, respecting user query. Our method\ncan optionally condition on the source texture in part or all of the image. In\ndoing so, we overcome a common problem in existing image colorization methods\nthat are unable to produce colors with a different luminance than the input. We\nevaluate several possibilities for texture conditioning and their trade-offs,\nincluding luminance, image gradients, and thresholded gradients, the latter of\nwhich performed best in maintaining texture and color control simultaneously.\nOur method can be usefully extended to another practical edit: recoloring\npatches of an image while respecting the source texture. Our procedure is\nsupported by several qualitative and quantitative evaluations.\n","authors":["Vaibhav Vavilala","Faaris Shaik","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2307.02698v5.pdf","comment":"WACV 2025 23 pages, 21 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.13247v1","updated":"2025-01-22T21:58:04Z","published":"2025-01-22T21:58:04Z","title":"Multimodal AI on Wound Images and Clinical Notes for Home Patient\n Referral","summary":" Chronic wounds affect 8.5 million Americans, particularly the elderly and\npatients with diabetes. 
These wounds can take up to nine months to heal, making\nregular care essential to ensure healing and prevent severe outcomes like limb\namputations. Many patients receive care at home from visiting nurses with\nvarying levels of wound expertise, leading to inconsistent care. Problematic,\nnon-healing wounds should be referred to wound specialists, but referral\ndecisions in non-clinical settings are often erroneous, delayed, or\nunnecessary.\n This paper introduces the Deep Multimodal Wound Assessment Tool (DM-WAT), a\nmachine learning framework designed to assist visiting nurses in deciding\nwhether to refer chronic wound patients. DM-WAT analyzes smartphone-captured\nwound images and clinical notes from Electronic Health Records (EHRs). It uses\nDeiT-Base-Distilled, a Vision Transformer (ViT), to extract visual features\nfrom images and DeBERTa-base to extract text features from clinical notes.\nDM-WAT combines visual and text features using an intermediate fusion approach.\nTo address challenges posed by a small and imbalanced dataset, it integrates\nimage and text augmentation with transfer learning to achieve high performance.\nIn evaluations, DM-WAT achieved 77% with std 3% accuracy and a 70% with std 2%\nF1 score, outperforming prior approaches. Score-CAM and Captum interpretation\nalgorithms provide insights into specific parts of image and text inputs that\ninfluence recommendations, enhancing interpretability and trust.\n","authors":["Reza Saadati Fard","Emmanuel Agu","Palawat Busaranuvong","Deepak Kumar","Shefalika Gautam","Bengisu Tulu","Diane Strong"],"pdf_url":"https://arxiv.org/pdf/2501.13247v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2208.05051 by other authors"},{"id":"http://arxiv.org/abs/2411.17698v3","updated":"2025-01-22T20:03:04Z","published":"2024-11-26T18:59:58Z","title":"Video-Guided Foley Sound Generation with Multimodal Controls","summary":" Generating sound effects for videos often requires creating artistic sound\neffects that diverge significantly from real-life sources and flexible control\nin the sound design. To address this problem, we introduce MultiFoley, a model\ndesigned for video-guided sound generation that supports multimodal\nconditioning through text, audio, and video. Given a silent video and a text\nprompt, MultiFoley allows users to create clean sounds (e.g., skateboard wheels\nspinning without wind noise) or more whimsical sounds (e.g., making a lion's\nroar sound like a cat's meow). MultiFoley also allows users to choose reference\naudio from sound effects (SFX) libraries or partial videos for conditioning. A\nkey novelty of our model lies in its joint training on both internet video\ndatasets with low-quality audio and professional SFX recordings, enabling\nhigh-quality, full-bandwidth (48kHz) audio generation. Through automated\nevaluations and human studies, we demonstrate that MultiFoley successfully\ngenerates synchronized high-quality sounds across varied conditional inputs and\noutperforms existing methods. 
Please see our project page for video results:\nhttps://ificl.github.io/MultiFoley/\n","authors":["Ziyang Chen","Prem Seetharaman","Bryan Russell","Oriol Nieto","David Bourgin","Andrew Owens","Justin Salamon"],"pdf_url":"https://arxiv.org/pdf/2411.17698v3.pdf","comment":"Project site: https://ificl.github.io/MultiFoley/"},{"id":"http://arxiv.org/abs/2501.13193v1","updated":"2025-01-22T19:50:51Z","published":"2025-01-22T19:50:51Z","title":"Revisiting Data Augmentation for Ultrasound Images","summary":" Data augmentation is a widely used and effective technique to improve the\ngeneralization performance of deep neural networks. Yet, despite often facing\nlimited data availability when working with medical images, it is frequently\nunderutilized. This appears to come from a gap in our collective understanding\nof the efficacy of different augmentation techniques across different tasks and\nmodalities. One modality where this is especially true is ultrasound imaging.\nThis work addresses this gap by analyzing the effectiveness of different\naugmentation techniques at improving model performance across a wide range of\nultrasound image analysis tasks. To achieve this, we introduce a new\nstandardized benchmark of 14 ultrasound image classification and semantic\nsegmentation tasks from 10 different sources and covering 11 body regions. Our\nresults demonstrate that many of the augmentations commonly used for tasks on\nnatural images are also effective on ultrasound images, even more so than\naugmentations developed specifically for ultrasound images in some cases. We\nalso show that diverse augmentation using TrivialAugment, which is widely used\nfor natural images, is also effective for ultrasound images. Moreover, our\nproposed methodology represents a structured approach for assessing various\ndata augmentations that can be applied to other contexts and modalities.\n","authors":["Adam Tupper","Christian Gagné"],"pdf_url":"https://arxiv.org/pdf/2501.13193v1.pdf","comment":"For associated source code see\n https://github.com/adamtupper/ultrasound-augmentation"},{"id":"http://arxiv.org/abs/2501.13189v1","updated":"2025-01-22T19:40:04Z","published":"2025-01-22T19:40:04Z","title":"Map Prediction and Generative Entropy for Multi-Agent Exploration","summary":" Traditionally, autonomous reconnaissance applications have acted on explicit\nsets of historical observations. Aided by recent breakthroughs in generative\ntechnologies, this work enables robot teams to act beyond what is currently\nknown about the environment by inferring a distribution of reasonable\ninterpretations of the scene. We developed a map predictor that inpaints the\nunknown space in a multi-agent 2D occupancy map during an exploration mission.\nFrom a comparison of several inpainting methods, we found that a fine-tuned\nlatent diffusion inpainting model could provide rich and coherent\ninterpretations of simulated urban environments with relatively little\ncomputation time. By iteratively inferring interpretations of the scene\nthroughout an exploration run, we are able to identify areas that exhibit high\nuncertainty in the prediction, which we formalize with the concept of\ngenerative entropy. We prioritize tasks in regions of high generative entropy,\nhypothesizing that this will expedite convergence on an accurate predicted map\nof the scene. In our study we juxtapose this new paradigm of task ranking with\nthe state of the art, which ranks regions to explore by those which maximize\nexpected information recovery. 
We compare both of these methods in a simulated\nurban environment with three vehicles. Our results demonstrate that by using\nour new task ranking method, we can predict a correct scene significantly\nfaster than with a traditional information-guided method.\n","authors":["Alexander Spinos","Bradley Woosley","Justin Rokisky","Christopher Korpela","John G. Rogers III","Brian A. Bittner"],"pdf_url":"https://arxiv.org/pdf/2501.13189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13183v1","updated":"2025-01-22T19:30:28Z","published":"2025-01-22T19:30:28Z","title":"MONA: Moving Object Detection from Videos Shot by Dynamic Camera","summary":" Dynamic urban environments, characterized by moving cameras and objects, pose\nsignificant challenges for camera trajectory estimation by complicating the\ndistinction between camera-induced and object motion. We introduce MONA, a\nnovel framework designed for robust moving object detection and segmentation\nfrom videos shot by dynamic cameras. MONA comprises two key modules: Dynamic\nPoints Extraction, which leverages optical flow and tracking any point to\nidentify dynamic points, and Moving Object Segmentation, which employs adaptive\nbounding box filtering, and the Segment Anything for precise moving object\nsegmentation. We validate MONA by integrating with the camera trajectory\nestimation method LEAP-VO, and it achieves state-of-the-art results on the MPI\nSintel dataset comparing to existing methods. These results demonstrate MONA's\neffectiveness for moving object detection and its potential in many other\napplications in the urban planning field.\n","authors":["Boxun Hu","Mingze Xia","Ding Zhao","Guanlin Wu"],"pdf_url":"https://arxiv.org/pdf/2501.13183v1.pdf","comment":null}]},"2025-01-23T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.13928v1","updated":"2025-01-23T18:59:55Z","published":"2025-01-23T18:59:55Z","title":"Fast3R: Towards 3D Reconstruction of 1000+ Images in One Forward Pass","summary":" Multi-view 3D reconstruction remains a core challenge in computer vision,\nparticularly in applications requiring accurate and scalable representations\nacross diverse perspectives. Current leading methods such as DUSt3R employ a\nfundamentally pairwise approach, processing images in pairs and necessitating\ncostly global alignment procedures to reconstruct from multiple views. In this\nwork, we propose Fast 3D Reconstruction (Fast3R), a novel multi-view\ngeneralization to DUSt3R that achieves efficient and scalable 3D reconstruction\nby processing many views in parallel. Fast3R's Transformer-based architecture\nforwards N images in a single forward pass, bypassing the need for iterative\nalignment. Through extensive experiments on camera pose estimation and 3D\nreconstruction, Fast3R demonstrates state-of-the-art performance, with\nsignificant improvements in inference speed and reduced error accumulation.\nThese results establish Fast3R as a robust alternative for multi-view\napplications, offering enhanced scalability without compromising reconstruction\naccuracy.\n","authors":["Jianing Yang","Alexander Sax","Kevin J. 
Liang","Mikael Henaff","Hao Tang","Ang Cao","Joyce Chai","Franziska Meier","Matt Feiszli"],"pdf_url":"https://arxiv.org/pdf/2501.13928v1.pdf","comment":"Project website: https://fast3r-3d.github.io/"},{"id":"http://arxiv.org/abs/2501.13919v1","updated":"2025-01-23T18:58:03Z","published":"2025-01-23T18:58:03Z","title":"Temporal Preference Optimization for Long-Form Video Understanding","summary":" Despite significant advancements in video large multimodal models\n(video-LMMs), achieving effective temporal grounding in long-form videos\nremains a challenge for existing models. To address this limitation, we propose\nTemporal Preference Optimization (TPO), a novel post-training framework\ndesigned to enhance the temporal grounding capabilities of video-LMMs through\npreference learning. TPO adopts a self-training approach that enables models to\ndifferentiate between well-grounded and less accurate temporal responses by\nleveraging curated preference datasets at two granularities: localized temporal\ngrounding, which focuses on specific video segments, and comprehensive temporal\ngrounding, which captures extended temporal dependencies across entire video\nsequences. By optimizing on these preference datasets, TPO significantly\nenhances temporal understanding while reducing reliance on manually annotated\ndata. Extensive experiments on three long-form video understanding\nbenchmarks--LongVideoBench, MLVU, and Video-MME--demonstrate the effectiveness\nof TPO across two state-of-the-art video-LMMs. Notably, LLaVA-Video-TPO\nestablishes itself as the leading 7B model on the Video-MME benchmark,\nunderscoring the potential of TPO as a scalable and efficient solution for\nadvancing temporal reasoning in long-form video understanding. Project page:\nhttps://ruili33.github.io/tpo_website.\n","authors":["Rui Li","Xiaohan Wang","Yuhui Zhang","Zeyu Wang","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2501.13919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13876v1","updated":"2025-01-23T17:49:49Z","published":"2025-01-23T17:49:49Z","title":"FAST-LIVO2 on Resource-Constrained Platforms: LiDAR-Inertial-Visual\n Odometry with Efficient Memory and Computation","summary":" This paper presents a lightweight LiDAR-inertial-visual odometry system\noptimized for resource-constrained platforms. It integrates a\ndegeneration-aware adaptive visual frame selector into error-state iterated\nKalman filter (ESIKF) with sequential updates, improving computation efficiency\nsignificantly while maintaining a similar level of robustness. Additionally, a\nmemory-efficient mapping structure combining a locally unified visual-LiDAR map\nand a long-term visual map achieves a good trade-off between performance and\nmemory usage. Extensive experiments on x86 and ARM platforms demonstrate the\nsystem's robustness and efficiency. On the Hilti dataset, our system achieves a\n33% reduction in per-frame runtime and 47% lower memory usage compared to\nFAST-LIVO2, with only a 3 cm increase in RMSE. Despite this slight accuracy\ntrade-off, our system remains competitive, outperforming state-of-the-art\n(SOTA) LIO methods such as FAST-LIO2 and most existing LIVO systems. 
These\nresults validate the system's capability for scalable deployment on\nresource-constrained edge computing platforms.\n","authors":["Bingyang Zhou","Chunran Zheng","Ziming Wang","Fangcheng Zhu","Yixi Cai","Fu Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.13876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13855v1","updated":"2025-01-23T17:24:24Z","published":"2025-01-23T17:24:24Z","title":"First Lessons Learned of an Artificial Intelligence Robotic System for\n Autonomous Coarse Waste Recycling Using Multispectral Imaging-Based Methods","summary":" Current disposal facilities for coarse-grained waste perform manual sorting\nof materials with heavy machinery. Large quantities of recyclable materials are\nlost to coarse waste, so more effective sorting processes must be developed to\nrecover them. Two key aspects to automate the sorting process are object\ndetection with material classification in mixed piles of waste, and autonomous\ncontrol of hydraulic machinery. Because most objects in those accumulations of\nwaste are damaged or destroyed, object detection alone is not feasible in the\nmajority of cases. To address these challenges, we propose a classification of\nmaterials with multispectral images of ultraviolet (UV), visual (VIS), near\ninfrared (NIR), and short-wave infrared (SWIR) spectrums. Solution for\nautonomous control of hydraulic heavy machines for sorting of bulky waste is\nbeing investigated using cost-effective cameras and artificial\nintelligence-based controllers.\n","authors":["Timo Lange","Ajish Babu","Philipp Meyer","Matthis Keppner","Tim Tiedemann","Martin Wittmaier","Sebastian Wolff","Thomas Vögele"],"pdf_url":"https://arxiv.org/pdf/2501.13855v1.pdf","comment":"Published in Proceedings of Sardinia 2023, 19th International\n Symposium on Waste Management, Resource Recovery and Sustainable Landfilling"},{"id":"http://arxiv.org/abs/2501.13817v1","updated":"2025-01-23T16:39:08Z","published":"2025-01-23T16:39:08Z","title":"Temporal Logic Guided Safe Navigation for Autonomous Vehicles","summary":" Safety verification for autonomous vehicles (AVs) and ground robots is\ncrucial for ensuring reliable operation given their uncertain environments.\nFormal language tools provide a robust and sound method to verify safety rules\nfor such complex cyber-physical systems. In this paper, we propose a hybrid\napproach that combines the strengths of formal verification languages like\nLinear Temporal Logic (LTL) and Signal Temporal Logic (STL) to generate safe\ntrajectories and optimal control inputs for autonomous vehicle navigation. We\nimplement a symbolic path planning approach using LTL to generate a formally\nsafe reference trajectory. A mixed integer linear programming (MILP) solver is\nthen used on this reference trajectory to solve for the control inputs while\nsatisfying the state, control and safety constraints described by STL. We test\nour proposed solution on two environments and compare the results with popular\npath planning algorithms. 
In contrast to conventional path planning algorithms,\nour formally safe solution excels in handling complex specification scenarios\nwhile ensuring both safety and comparable computation times.\n","authors":["Aditya Parameshwaran","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2501.13817v1.pdf","comment":"6 pages, 5 figures, Modelling Estimation and Controls Conference-2024"},{"id":"http://arxiv.org/abs/2501.13804v1","updated":"2025-01-23T16:23:15Z","published":"2025-01-23T16:23:15Z","title":"Towards Real-World Validation of a Physics-Based Ship Motion Prediction\n Model","summary":" The maritime industry aims towards a sustainable future, which requires\nsignificant improvements in operational efficiency. Current approaches focus on\nminimising fuel consumption and emissions through greater autonomy. Efficient\nand safe autonomous navigation requires high-fidelity ship motion models\napplicable to real-world conditions. Although physics-based ship motion models\ncan predict ships' motion with sub-second resolution, their validation in\nreal-world conditions is rarely found in the literature. This study presents a\nphysics-based 3D dynamics motion model that is tailored to a container-ship,\nand compares its predictions against real-world voyages. The model integrates\nvessel motion over time and accounts for its hydrodynamic behavior under\ndifferent environmental conditions. The model's predictions are evaluated\nagainst real vessel data both visually and using multiple distance measures.\nBoth methodologies demonstrate that the model's predictions align closely with\nthe real-world trajectories of the container-ship.\n","authors":["Michail Mathioudakis","Christos Papandreou","Theodoros Stouraitis","Vicky Margari","Antonios Nikitakis","Stavros Paschalakis","Konstantinos Kyriakopoulos","Kostas J. Spyrou"],"pdf_url":"https://arxiv.org/pdf/2501.13804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04909v2","updated":"2025-01-23T15:31:37Z","published":"2024-02-07T14:38:51Z","title":"Entanglement Definitions for Tethered Robots: Exploration and Analysis","summary":" In this article we consider the problem of tether entanglement for tethered\nmobile robots. One of the main risks of using a tethered connection between a\nmobile robot and an anchor point is that the tether may get entangled with the\nobstacles present in the environment or with itself. To avoid these situations,\na non-entanglement constraint can be considered in the motion planning problem\nfor tethered robots. This constraint is typically expressed as a set of\nspecific tether configurations that must be avoided. However, the literature\nlacks a generally accepted definition of entanglement, with existing\ndefinitions being limited and partial in the sense that they only focus on\nspecific instances of entanglement. In practice, this means that the existing\ndefinitions do not effectively cover all instances of tether entanglement. Our\ngoal in this article is to bridge this gap and to provide new definitions of\nentanglement, which, together with the existing ones, can be effectively used\nto qualify the entanglement state of a tethered robot in diverse situations.\nThe new definitions find application in motion planning for tethered robots,\nwhere they can be used to obtain more safe and robust entanglement-free\ntrajectories.\n","authors":["Gianpietro Battocletti","Dimitris Boskos","Domagoj Tolić","Ivana Palunko","Bart De Schutter"],"pdf_url":"https://arxiv.org/pdf/2402.04909v2.pdf","comment":"18 pages, 9 figures. 
Published on IEEE Access"},{"id":"http://arxiv.org/abs/2501.13725v1","updated":"2025-01-23T14:58:49Z","published":"2025-01-23T14:58:49Z","title":"You Only Crash Once v2: Perceptually Consistent Strong Features for\n One-Stage Domain Adaptive Detection of Space Terrain","summary":" The in-situ detection of planetary, lunar, and small-body surface terrain is\ncrucial for autonomous spacecraft applications, where learning-based computer\nvision methods are increasingly employed to enable intelligence without prior\ninformation or human intervention. However, many of these methods remain\ncomputationally expensive for spacecraft processors and prevent real-time\noperation. Training of such algorithms is additionally complex due to the\nscarcity of labeled data and reliance on supervised learning approaches.\nUnsupervised Domain Adaptation (UDA) offers a promising solution by\nfacilitating model training with disparate data sources such as simulations or\nsynthetic scenes, although UDA is difficult to apply to celestial environments\nwhere challenging feature spaces are paramount. To alleviate such issues, You\nOnly Crash Once (YOCOv1) has studied the integration of Visual Similarity-based\nAlignment (VSA) into lightweight one-stage object detection architectures to\nimprove space terrain UDA. Although proven effective, the approach faces\nnotable limitations, including performance degradations in multi-class and\nhigh-altitude scenarios. Building upon the foundation of YOCOv1, we propose\nnovel additions to the VSA scheme that enhance terrain detection capabilities\nunder UDA, and our approach is evaluated across both simulated and real-world\ndata. Our second YOCO rendition, YOCOv2, is capable of achieving\nstate-of-the-art UDA performance on surface terrain detection, where we\nshowcase improvements upwards of 31% compared with YOCOv1 and terrestrial\nstate-of-the-art. We demonstrate the practical utility of YOCOv2 with\nspacecraft flight hardware performance benchmarking and qualitative evaluation\nof NASA mission data.\n","authors":["Timothy Chase Jr","Christopher Wilson","Karthik Dantu"],"pdf_url":"https://arxiv.org/pdf/2501.13725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11683v3","updated":"2025-01-23T14:45:03Z","published":"2024-11-18T16:09:26Z","title":"TrojanRobot: Physical-World Backdoor Attacks Against VLM-based Robotic\n Manipulation","summary":" Robotic manipulation in the physical world is increasingly empowered by\n\\textit{large language models} (LLMs) and \\textit{vision-language models}\n(VLMs), leveraging their understanding and perception capabilities. Recently,\nvarious attacks against such robotic policies have been proposed, with backdoor\nattacks drawing considerable attention for their high stealth and strong\npersistence capabilities. However, existing backdoor efforts are limited to\nsimulators and suffer from physical-world realization. To address this, we\npropose \\textit{TrojanRobot}, a highly stealthy and broadly effective robotic\nbackdoor attack in the physical world. Specifically, we introduce a\nmodule-poisoning approach by embedding a backdoor module into the modular\nrobotic policy, enabling backdoor control over the policy's visual perception\nmodule thereby backdooring the entire robotic policy. Our vanilla\nimplementation leverages a backdoor-finetuned VLM to serve as the backdoor\nmodule. 
To enhance its generalization in physical environments, we propose a\nprime implementation, leveraging the LVLM-as-a-backdoor paradigm and developing\nthree types of prime attacks, \\ie, \\textit{permutation}, \\textit{stagnation},\nand \\textit{intentional} attacks, thus achieving finer-grained backdoors.\nExtensive experiments on the UR3e manipulator with 18 task instructions using\nrobotic policies based on four VLMs demonstrate the broad effectiveness and\nphysical-world stealth of TrojanRobot. Our attack's video demonstrations are\navailable via a github link \\url{https://trojanrobot.github.io}.\n","authors":["Xianlong Wang","Hewen Pan","Hangtao Zhang","Minghui Li","Shengshan Hu","Ziqi Zhou","Lulu Xue","Peijin Guo","Yichen Wang","Wei Wan","Aishan Liu","Leo Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11683v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13641v1","updated":"2025-01-23T13:18:52Z","published":"2025-01-23T13:18:52Z","title":"The Road to Learning Explainable Inverse Kinematic Models: Graph Neural\n Networks as Inductive Bias for Symbolic Regression","summary":" This paper shows how a Graph Neural Network (GNN) can be used to learn an\nInverse Kinematics (IK) based on an automatically generated dataset. The\ngenerated Inverse Kinematics is generalized to a family of manipulators with\nthe same Degree of Freedom (DOF), but varying link length configurations. The\nresults indicate a position error of less than 1.0 cm for 3 DOF and 4.5 cm for\n5 DOF, and orientation error of 2$^\\circ$ for 3 DOF and 8.2$^\\circ$ for 6 DOF,\nwhich allows the deployment to certain real world-problems. However,\nout-of-domain errors and lack of extrapolation can be observed in the resulting\nGNN. An extensive analysis of these errors indicates potential for enhancement\nin the future. Consequently, the generated GNNs are tailored to be used in\nfuture work as an inductive bias to generate analytical equations through\nsymbolic regression.\n","authors":["Pravin Pandey","Julia Reuter","Christoph Steup","Sanaz Mostaghim"],"pdf_url":"https://arxiv.org/pdf/2501.13641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11221v2","updated":"2025-01-23T12:35:51Z","published":"2024-02-17T08:32:22Z","title":"MOB-Net: Limb-modularized Uncertainty Torque Learning of Humanoids for\n Sensorless External Torque Estimation","summary":" Momentum observer (MOB) can estimate external joint torque without requiring\nadditional sensors, such as force/torque or joint torque sensors. However, the\nestimation performance of MOB deteriorates due to the model uncertainty which\nencompasses the modeling errors and the joint friction. Moreover, the\nestimation error is significant when MOB is applied to high-dimensional\nfloating-base humanoids, which prevents the estimated external joint torque\nfrom being used for force control or collision detection in the real humanoid\nrobot. In this paper, the pure external joint torque estimation method named\nMOB-Net, is proposed for humanoids. MOB-Net learns the model uncertainty torque\nand calibrates the estimated signal of MOB. The external joint torque can be\nestimated in the generalized coordinate including whole-body and virtual joints\nof the floating-base robot with only internal sensors (an IMU on the pelvis and\nencoders in the joints). 
Our method substantially reduces the estimation errors\nof MOB, and the robust performance of MOB-Net for the unseen data is validated\nthrough extensive simulations, real robot experiments, and ablation studies.\nFinally, various collision handling scenarios are presented using the estimated\nexternal joint torque from MOB-Net: contact wrench feedback control for\nlocomotion, collision detection, and collision reaction for safety.\n","authors":["Daegyu Lim","Myeong-Ju Kim","Junhyeok Cha","Jaeheung Park"],"pdf_url":"https://arxiv.org/pdf/2402.11221v2.pdf","comment":"Published to IJRR"},{"id":"http://arxiv.org/abs/2501.09600v4","updated":"2025-01-23T11:25:43Z","published":"2025-01-16T15:22:06Z","title":"Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid\n Prototyping in Virtual Reality Applications","summary":" SLAM is a foundational technique with broad applications in robotics and\nAR/VR. SLAM simulations evaluate new concepts, but testing on\nresource-constrained devices, such as VR HMDs, faces challenges: high\ncomputational cost and restricted sensor data access. This work proposes a\nsparse framework using mesh geometry projections as features, which improves\nefficiency and circumvents direct sensor data access, advancing SLAM research\nas we demonstrate in VR and through numerical evaluation.\n","authors":["Carlos Augusto Pinheiro de Sousa","Heiko Hamann","Oliver Deussen"],"pdf_url":"https://arxiv.org/pdf/2501.09600v4.pdf","comment":"Accepted to ENPT XR at IEEE VR 2025"},{"id":"http://arxiv.org/abs/2501.13507v1","updated":"2025-01-23T09:43:16Z","published":"2025-01-23T09:43:16Z","title":"Iterative Shaping of Multi-Particle Aggregates based on Action Trees and\n VLM","summary":" In this paper, we address the problem of manipulating multi-particle\naggregates using a bimanual robotic system. Our approach enables the autonomous\ntransport of dispersed particles through a series of shaping and pushing\nactions using robotically-controlled tools. Achieving this advanced\nmanipulation capability presents two key challenges: high-level task planning\nand trajectory execution. For task planning, we leverage Vision Language Models\n(VLMs) to enable primitive actions such as tool affordance grasping and\nnon-prehensile particle pushing. For trajectory execution, we represent the\nevolving particle aggregate's contour using truncated Fourier series, providing\nefficient parametrization of its closed shape. We adaptively compute trajectory\nwaypoints based on group cohesion and the geometric centroid of the aggregate,\naccounting for its spatial distribution and collective motion. Through\nreal-world experiments, we demonstrate the effectiveness of our methodology in\nactively shaping and manipulating multi-particle aggregates while maintaining\nhigh system cohesion.\n","authors":["Hoi-Yin Lee","Peng Zhou","Anqing Duan","Chenguang Yang","David Navarro-Alarcon"],"pdf_url":"https://arxiv.org/pdf/2501.13507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13461v1","updated":"2025-01-23T08:23:45Z","published":"2025-01-23T08:23:45Z","title":"Knowledge-Informed Multi-Agent Trajectory Prediction at Signalized\n Intersections for Infrastructure-to-Everything","summary":" Multi-agent trajectory prediction at signalized intersections is crucial for\ndeveloping efficient intelligent transportation systems and safe autonomous\ndriving systems. 
Due to the complexity of intersection scenarios and the\nlimitations of single-vehicle perception, the performance of vehicle-centric\nprediction methods has reached a plateau. Furthermore, most works underutilize\ncritical intersection information, including traffic signals, and behavior\npatterns induced by road structures. Therefore, we propose a multi-agent\ntrajectory prediction framework at signalized intersections dedicated to\nInfrastructure-to-Everything (I2XTraj). Our framework leverages dynamic graph\nattention to integrate knowledge from traffic signals and driving behaviors. A\ncontinuous signal-informed mechanism is proposed to adaptively process\nreal-time traffic signals from infrastructure devices. Additionally, leveraging\nthe prior knowledge of the intersection topology, we propose a driving strategy\nawareness mechanism to model the joint distribution of goal intentions and\nmaneuvers. To the best of our knowledge, I2XTraj represents the first\nmulti-agent trajectory prediction framework explicitly designed for\ninfrastructure deployment, supplying subscribable prediction services to all\nvehicles at intersections. I2XTraj demonstrates state-of-the-art performance on\nboth the Vehicle-to-Infrastructure dataset V2X-Seq and the aerial-view dataset\nSinD for signalized intersections. Quantitative evaluations show that our\napproach outperforms existing methods by more than 30% in both multi-agent and\nsingle-agent scenarios.\n","authors":["Huilin Yin","Yangwenhui Xu","Jiaxiang Li","Hao Zhang","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2501.13461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13457v1","updated":"2025-01-23T08:15:52Z","published":"2025-01-23T08:15:52Z","title":"Zero-Shot Trajectory Planning for Signal Temporal Logic Tasks","summary":" Signal Temporal Logic (STL) is a powerful specification language for\ndescribing complex temporal behaviors of continuous signals, making it\nwell-suited for high-level robotic task descriptions. However, generating\nexecutable plans for STL tasks is challenging, as it requires consideration of\nthe coupling between the task specification and the system dynamics. Existing\napproaches either follow a model-based setting that explicitly requires\nknowledge of the system dynamics or adopt a task-oriented data-driven approach\nto learn plans for specific tasks. In this work, we investigate the problem of\ngenerating executable STL plans for systems whose dynamics are unknown a\npriori. We propose a new planning framework that uses only task-agnostic data\nduring the offline training stage, enabling zero-shot generalization to new STL\ntasks. Our framework is hierarchical, involving: (i) decomposing the STL task\ninto a set of progress and time constraints, (ii) searching for time-aware\nwaypoints guided by task-agnostic data, and (iii) generating trajectories using\na pre-trained safe diffusion model. Simulation results demonstrate the\neffectiveness of our method indeed in achieving zero-shot generalization to\nvarious STL tasks.\n","authors":["Ruijia Liu","Ancheng Hou","Xiao Yu","Xiang Yin"],"pdf_url":"https://arxiv.org/pdf/2501.13457v1.pdf","comment":"submitted"},{"id":"http://arxiv.org/abs/2501.13432v1","updated":"2025-01-23T07:35:47Z","published":"2025-01-23T07:35:47Z","title":"Emotion estimation from video footage with LSTM","summary":" Emotion estimation in general is a field that has been studied for a long\ntime, and several approaches exist using machine learning. 
In this paper, we\npresent an LSTM model that processes the blend-shapes produced by the MediaPipe\nlibrary for a face detected in a live camera stream to estimate the\nmain emotion from facial expressions. The model is trained on the FER2013\ndataset and achieves 71% accuracy and a 62% F1-score, which meets the\naccuracy benchmark of the FER2013 dataset, with significantly reduced\ncomputation costs. https://github.com/\nSamir-atra/Emotion_estimation_from_video_footage_with_LSTM_ML_algorithm\n","authors":["Samer Attrah"],"pdf_url":"https://arxiv.org/pdf/2501.13432v1.pdf","comment":"11 pages, 6 figures, 32 references, 4 tables"},{"id":"http://arxiv.org/abs/2501.00368v3","updated":"2025-01-23T07:04:34Z","published":"2024-12-31T09:44:18Z","title":"Design Optimizer for Soft Growing Robot Manipulators in\n Three-Dimensional Environments","summary":" Soft growing robots are novel devices that mimic plant-like growth for\nnavigation in cluttered or dangerous environments. Their ability to adapt to\nsurroundings, combined with advancements in actuation and manufacturing\ntechnologies, allows them to perform specialized manipulation tasks. This work\npresents an approach for design optimization of soft growing robots;\nspecifically, the three-dimensional extension of the optimizer designed for\nplanar manipulators. This tool is intended to be used by engineers and robot\nenthusiasts before manufacturing their robot: it suggests the optimal size of\nthe robot for solving a specific task. The design process models a\nmulti-objective optimization problem to refine a soft manipulator's kinematic\nchain. Thanks to the novel Rank Partitioning algorithm integrated into\nEvolutionary Computation (EC) algorithms, this method achieves high precision\nin reaching targets and is efficient in resource usage. Results show\nsignificantly high performance in solving three-dimensional tasks, whereas\ncomparative experiments indicate that the optimizer features robust output when\ntested with different EC algorithms, particularly genetic algorithms.\n","authors":["Ahmet Astar","Ozan Nurcan","Erk Demirel","Emir Ozen","Ozan Kutlar","Fabio Stroppa"],"pdf_url":"https://arxiv.org/pdf/2501.00368v3.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.13417v1","updated":"2025-01-23T06:43:38Z","published":"2025-01-23T06:43:38Z","title":"GeomGS: LiDAR-Guided Geometry-Aware Gaussian Splatting for Robot\n Localization","summary":" Mapping and localization are crucial problems in robotics and autonomous\ndriving. Recent advances in 3D Gaussian Splatting (3DGS) have enabled precise\n3D mapping and scene understanding by rendering photo-realistic images.\nHowever, existing 3DGS methods often struggle to accurately reconstruct a 3D\nmap that reflects the actual scale and geometry of the real world, which\ndegrades localization performance. To address these limitations, we propose a\nnovel 3DGS method called Geometry-Aware Gaussian Splatting (GeomGS). This\nmethod fully integrates LiDAR data into 3D Gaussian primitives via a\nprobabilistic approach, as opposed to approaches that only use LiDAR as initial\npoints or introduce simple constraints for Gaussian points. To this end, we\nintroduce a Geometric Confidence Score (GCS), which identifies the structural\nreliability of each Gaussian point. The GCS is optimized simultaneously with\nGaussians under probabilistic distance constraints to construct a precise\nstructure. 
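The emotion-estimation abstract above feeds MediaPipe blend-shapes into an LSTM. Here is a minimal sketch of such a classifier with hypothetical layer sizes, assuming a few dozen blendshape coefficients per frame; it is not the authors' architecture or their FER2013 training setup.

```python
# Minimal sketch, not the released implementation: an LSTM classifier over
# per-frame face blendshape coefficients (shapes are assumptions).
import torch
import torch.nn as nn

class BlendshapeEmotionLSTM(nn.Module):
    def __init__(self, n_blendshapes: int = 52, hidden: int = 128, n_emotions: int = 7):
        super().__init__()
        self.lstm = nn.LSTM(input_size=n_blendshapes, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_emotions)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, frames, n_blendshapes) sequence of blendshape scores
        _, (h_n, _) = self.lstm(x)          # h_n: (1, batch, hidden)
        return self.head(h_n[-1])           # emotion logits per clip

# Toy usage: a batch of 4 clips, 30 frames each.
model = BlendshapeEmotionLSTM()
logits = model(torch.rand(4, 30, 52))
print(logits.shape)  # torch.Size([4, 7])
```

Working on low-dimensional blendshape scores instead of raw frames is what keeps the computation cost low, which is the property the abstract emphasizes.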
Furthermore, we propose a novel localization method that fully\nutilizes both the geometric and photometric properties of GeomGS. Our GeomGS\ndemonstrates state-of-the-art geometric and localization performance across\nseveral benchmarks, while also improving photometric performance.\n","authors":["Jaewon Lee","Mangyu Kong","Minseong Park","Euntai Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13417v1.pdf","comment":"Preprint, Under review"},{"id":"http://arxiv.org/abs/2501.13416v1","updated":"2025-01-23T06:42:28Z","published":"2025-01-23T06:42:28Z","title":"M3PT: A Transformer for Multimodal, Multi-Party Social Signal Prediction\n with Person-aware Blockwise Attention","summary":" Understanding social signals in multi-party conversations is important for\nhuman-robot interaction and artificial social intelligence. Multi-party\ninteractions include social signals like body pose, head pose, speech, and\ncontext-specific activities like acquiring and taking bites of food when\ndining. Incorporating all the multimodal signals in a multi-party interaction\nis difficult, and past work tends to build task-specific models for predicting\nsocial signals. In this work, we address the challenge of predicting multimodal\nsocial signals in multi-party settings in a single model. We introduce M3PT, a\ncausal transformer architecture with modality and temporal blockwise attention\nmasking, which allows for the simultaneous processing of multiple social cues\nacross multiple participants and their temporal interactions. This approach\nbetter captures social dynamics over time by considering longer horizons of\nsocial signals between individuals. We train and evaluate our unified model on\nthe Human-Human Commensality Dataset (HHCD), and demonstrate that using\nmultiple modalities improves bite timing and speaking status prediction. Source\ncode: https://github.com/AbrarAnwar/masked-social-signals/\n","authors":["Yiming Tang","Abrar Anwar","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2501.13416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13402v1","updated":"2025-01-23T06:01:03Z","published":"2025-01-23T06:01:03Z","title":"VIGS SLAM: IMU-based Large-Scale 3D Gaussian Splatting SLAM","summary":" Recently, map representations based on radiance fields such as 3D Gaussian\nSplatting and NeRF, which are excellent for realistic depiction, have attracted\nconsiderable attention, leading to attempts to combine them with SLAM. While\nthese approaches can build highly realistic maps, large-scale SLAM still\nremains a challenge because they require a large number of Gaussian images for\nmapping and adjacent images as keyframes for tracking. We propose a novel 3D\nGaussian Splatting SLAM method, VIGS SLAM, that utilizes sensor fusion of RGB-D\nand IMU sensors for large-scale indoor environments. To reduce the\ncomputational load of 3DGS-based tracking, we adopt an ICP-based tracking\nframework that combines IMU preintegration to provide a good initial guess for\naccurate pose estimation. Our work is the first to propose that\nGaussian Splatting-based SLAM can be effectively performed in large-scale\nenvironments by integrating IMU sensor measurements. 
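M3PT's modality and temporal blockwise attention masking can be pictured with a generic blockwise-causal mask. The sketch below simply groups all participant/modality tokens of a time step into one block; it is an assumption for illustration, not the paper's exact person-aware scheme.

```python
# Hedged sketch of a blockwise-causal attention mask: a token may attend within
# its own temporal block and to earlier blocks. Not M3PT's exact masking.
import torch

def blockwise_causal_mask(n_steps: int, tokens_per_step: int) -> torch.Tensor:
    """Return an (L, L) boolean mask, True where attention is allowed."""
    block_id = torch.arange(n_steps).repeat_interleave(tokens_per_step)  # (L,)
    # Allowed if the key's temporal block is not later than the query's block.
    return block_id[None, :] <= block_id[:, None]

# Example: 3 time steps, 2 participants x 2 modalities = 4 tokens per step.
mask = blockwise_causal_mask(n_steps=3, tokens_per_step=4)
print(mask.shape)  # torch.Size([12, 12])
# A boolean mask like this can be passed to
# torch.nn.functional.scaled_dot_product_attention(..., attn_mask=mask).
```

The key property is that all cues within a time step can interact while temporal causality is preserved, which matches the behaviour the abstract describes at a high level.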
This proposal not only\nenhances the performance of Gaussian Splatting SLAM beyond room-scale scenarios\nbut also achieves SLAM performance comparable to state-of-the-art methods in\nlarge-scale indoor environments.\n","authors":["Gyuhyeon Pak","Euntai Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13402v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2209.08812v5","updated":"2025-01-23T05:49:17Z","published":"2022-09-19T07:52:02Z","title":"Generative Graphical Inverse Kinematics","summary":" Quickly and reliably finding accurate inverse kinematics (IK) solutions\nremains a challenging problem for many robot manipulators. Existing numerical\nsolvers are broadly applicable but typically only produce a single solution and\nrely on local search techniques to minimize nonconvex objective functions. More\nrecent learning-based approaches that approximate the entire feasible set of\nsolutions have shown promise as a means to generate multiple fast and accurate\nIK results in parallel. However, existing learning-based techniques have a\nsignificant drawback: each robot of interest requires a specialized model that\nmust be trained from scratch. To address this key shortcoming, we propose a\nnovel distance-geometric robot representation coupled with a graph structure\nthat allows us to leverage the sample efficiency of Euclidean equivariant\nfunctions and the generalizability of graph neural networks (GNNs). Our\napproach is generative graphical inverse kinematics (GGIK), the first learned\nIK solver able to accurately and efficiently produce a large number of diverse\nsolutions in parallel while also displaying the ability to generalize -- a\nsingle learned model can be used to produce IK solutions for a variety of\ndifferent robots. When compared to several other learned IK methods, GGIK\nprovides more accurate solutions with the same amount of data. GGIK can\ngeneralize reasonably well to robot manipulators unseen during training.\nAdditionally, GGIK can learn a constrained distribution that encodes joint\nlimits and scales efficiently to larger robots and a high number of sampled\nsolutions. Finally, GGIK can be used to complement local IK solvers by\nproviding reliable initializations for a local optimization process.\n","authors":["Oliver Limoyo","Filip Marić","Matthew Giamou","Petra Alexson","Ivan Petrović","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2209.08812v5.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.13072v2","updated":"2025-01-23T04:15:32Z","published":"2025-01-22T18:34:51Z","title":"AdaWM: Adaptive World Model based Planning for Autonomous Driving","summary":" World model based reinforcement learning (RL) has emerged as a promising\napproach for autonomous driving, which learns a latent dynamics model and uses\nit to train a planning policy. To speed up the learning process, the\npretrain-finetune paradigm is often used, where online RL is initialized by a\npretrained model and a policy learned offline. However, naively performing such\ninitialization in RL may result in dramatic performance degradation during the\nonline interactions in the new task. To tackle this challenge, we first analyze\nthe performance degradation and identify two primary root causes therein: the\nmismatch of the planning policy and the mismatch of the dynamics model, due to\ndistribution shift. 
We further analyze the effects of these factors on\nperformance degradation during finetuning, and our findings reveal that the\nchoice of finetuning strategies plays a pivotal role in mitigating these\neffects. We then introduce AdaWM, an Adaptive World Model based planning\nmethod, featuring two key steps: (a) mismatch identification, which quantifies\nthe mismatches and informs the finetuning strategy, and (b) alignment-driven\nfinetuning, which selectively updates either the policy or the model as needed\nusing efficient low-rank updates. Extensive experiments on the challenging\nCARLA driving tasks demonstrate that AdaWM significantly improves the\nfinetuning process, resulting in more robust and efficient performance in\nautonomous driving systems.\n","authors":["Hang Wang","Xin Ye","Feng Tao","Chenbin Pan","Abhirup Mallik","Burhaneddin Yaman","Liu Ren","Junshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.13072v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2501.13338v1","updated":"2025-01-23T02:39:04Z","published":"2025-01-23T02:39:04Z","title":"CuriousBot: Interactive Mobile Exploration via Actionable 3D Relational\n Object Graph","summary":" Mobile exploration is a longstanding challenge in robotics, yet current\nmethods primarily focus on active perception instead of active interaction,\nlimiting the robot's ability to interact with and fully explore its\nenvironment. Existing robotic exploration approaches via active interaction are\noften restricted to tabletop scenes, neglecting the unique challenges posed by\nmobile exploration, such as large exploration spaces, complex action spaces,\nand diverse object relations. In this work, we introduce a 3D relational object\ngraph that encodes diverse object relations and enables exploration through\nactive interaction. We develop a system based on this representation and\nevaluate it across diverse scenes. Our qualitative and quantitative results\ndemonstrate the system's effectiveness and generalization capabilities,\noutperforming methods that rely solely on vision-language models (VLMs).\n","authors":["Yixuan Wang","Leonor Fermoselle","Tarik Kelestemur","Jiuguang Wang","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2501.13338v1.pdf","comment":"Project Page: https://curiousbot.theaiinstitute.com/"},{"id":"http://arxiv.org/abs/2410.14565v2","updated":"2025-01-23T02:34:56Z","published":"2024-10-18T16:10:50Z","title":"Graph Optimality-Aware Stochastic LiDAR Bundle Adjustment with\n Progressive Spatial Smoothing","summary":" Large-scale LiDAR Bundle Adjustment (LBA) to refine sensor orientation and\npoint cloud accuracy simultaneously to build the navigation map is a\nfundamental task in logistics and robotics. Unlike pose-graph-based methods\nthat rely solely on pairwise relationships between LiDAR frames, LBA leverages\nraw LiDAR correspondences to achieve more precise results, especially when\ninitial pose estimates are unreliable for low-cost sensors. However, existing\nLBA methods face challenges such as simplistic planar correspondences,\nextensive observations, and dense normal matrices in the least-squares problem,\nwhich limit robustness, efficiency, and scalability. To address these issues,\nwe propose a Graph Optimality-aware Stochastic Optimization scheme with\nProgressive Spatial Smoothing, namely PSS-GOSO, to achieve \\textit{robust},\n\\textit{efficient}, and \\textit{scalable} LBA. 
The Progressive Spatial\nSmoothing (PSS) module extracts \\textit{robust} LiDAR feature association\nexploiting the prior structure information obtained by the polynomial smooth\nkernel. The Graph Optimality-aware Stochastic Optimization (GOSO) module first\nsparsifies the graph according to optimality for an \\textit{efficient}\noptimization. GOSO then utilizes stochastic clustering and graph\nmarginalization to solve the large-scale state estimation problem for a\n\\textit{scalable} LBA. We validate PSS-GOSO across diverse scenes captured by\nvarious platforms, demonstrating its superior performance compared to existing\nmethods. Moreover, the resulting point cloud maps are used for automatic\nlast-mile delivery in large-scale complex scenes. The project page can be found\nat: \\url{https://kafeiyin00.github.io/PSS-GOSO/}.\n","authors":["Jianping Li","Thien-Minh Nguyen","Muqing Cao","Shenghai Yuan","Tzu-Yi Hung","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2410.14565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10074v3","updated":"2025-01-23T02:31:25Z","published":"2025-01-17T09:46:27Z","title":"SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and\n Chain-of-Thought for Embodied Task Planning","summary":" Spatial reasoning is an essential problem in embodied AI research. Efforts to\nenhance spatial reasoning abilities through supplementary spatial data and\nfine-tuning have proven limited and ineffective when addressing complex\nembodied tasks, largely due to their dependence on language-based outputs.\nWhile some approaches have introduced a point-based action space to mitigate\nthis issue, they fall short in managing more intricate tasks within complex\nenvironments. This deficiency arises from their failure to fully exploit the\ninherent thinking and reasoning capabilities that are fundamental strengths of\nVision-Language Models (VLMs). To address these limitations, we propose a novel\napproach named SpatialCoT, specifically designed to bolster the spatial\nreasoning capabilities of VLMs. Our approach comprises two stages: spatial\ncoordinate bi-directional alignment, which aligns vision-language inputs with\nspatial coordinates, and chain-of-thought spatial grounding, which harnesses\nthe reasoning capabilities of language models for advanced spatial reasoning.\nWe evaluate SpatialCoT on challenging navigation and manipulation tasks, both\nin simulation and real-world settings. Experimental results demonstrate that\nour method significantly outperforms previous state-of-the-art approaches in\nboth tasks.\n","authors":["Yuecheng Liu","Dafeng Chi","Shiguang Wu","Zhanguang Zhang","Yaochen Hu","Lingfeng Zhang","Yingxue Zhang","Shuang Wu","Tongtong Cao","Guowei Huang","Helong Huang","Guangjian Tian","Weichao Qiu","Xingyue Quan","Jianye Hao","Yuzheng Zhuang"],"pdf_url":"https://arxiv.org/pdf/2501.10074v3.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2309.12397v2","updated":"2025-01-23T22:17:17Z","published":"2023-09-21T18:00:34Z","title":"POLAR-Sim: Augmenting NASA's POLAR Dataset for Data-Driven Lunar\n Perception and Rover Simulation","summary":" NASA's POLAR dataset contains approximately 2,600 pairs of high dynamic range\nstereo photos captured across 13 varied terrain scenarios, including areas with\nsparse or dense rock distributions, craters, and rocks of different sizes. The\npurpose of these photos is to spur development in robotics, AI-based\nperception, and autonomous navigation. 
Acknowledging a scarcity of lunar images\nfrom around the lunar poles, NASA Ames produced, on Earth but under controlled\nconditions, images that resemble rover operating conditions from these regions\nof the Moon. We report on the outcomes of an effort aimed at accomplishing two\ntasks. In Task 1, we provided bounding boxes and semantic segmentation\ninformation for all the images in NASA's POLAR dataset. This effort resulted in\n23,000 labels and semantic segmentation annotations pertaining to rocks,\nshadows, and craters. In Task 2, we generated the digital twins of the 13\nscenarios that have been used to produce all the photos in the POLAR dataset.\nSpecifically, for each of these scenarios, we produced individual meshes,\ntexture information, and material properties associated with the ground and the\nrocks in each scenario. This allows anyone with a camera model to synthesize\nimages associated with any of the 13 scenarios of the POLAR dataset.\nEffectively, one can generate as many semantically labeled synthetic images as\ndesired -- with different locations and exposure values in the scene, for\ndifferent positions of the sun, with or without the presence of active\nillumination, etc. The benefit of this work is twofold. Using outcomes of Task\n1, one can train and/or test perception algorithms that deal with Moon images.\nFor Task 2, one can produce as much data as desired to train and test AI\nalgorithms that are anticipated to work in lunar conditions. All the outcomes\nof this work are available in a public repository for unfettered use and\ndistribution.\n","authors":["Bo-Hsun Chen","Peter Negrut","Thomas Liang","Nevindu Batagoda","Harry Zhang","Dan Negrut"],"pdf_url":"https://arxiv.org/pdf/2309.12397v2.pdf","comment":"11 pages, 9 figures. This work has been submitted to the IEEE for\n possible publication"},{"id":"http://arxiv.org/abs/2308.03496v2","updated":"2025-01-23T21:53:16Z","published":"2023-08-07T11:43:02Z","title":"Design and Implementation of an Efficient Onboard Computer System for\n CanSat Atmosphere Monitoring","summary":" With advancements in technology, smaller versions of satellites have\ngained momentum in the space industry for earth monitoring and\ncommunication-based applications. The rise of CanSat technology has\nsignificantly impacted the space industry by providing a cost-effective\nsolution for space exploration. CanSat is a simulation model of a real\nsatellite and plays a crucial role in collecting and transmitting atmospheric\ndata. This paper discusses the design of an Onboard Computer System for CanSat,\nused to study various environmental parameters by monitoring the concentrations\nof gases in the atmosphere. The Onboard Computer System uses GPS,\naccelerometer, altitude, temperature, pressure, gyroscope, magnetometer, UV\nradiation, and air quality sensors for atmospheric sensing. A highly efficient\nand low-power ESP32 microcontroller and a transceiver module are used to\nacquire data, facilitate seamless communication, and transmit the collected data\nto the ground station.\n","authors":["Abhijit Gadekar"],"pdf_url":"https://arxiv.org/pdf/2308.03496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14099v1","updated":"2025-01-23T21:14:55Z","published":"2025-01-23T21:14:55Z","title":"The Perceived Danger (PD) Scale: Development and Validation","summary":" There are currently no psychometrically valid tools to measure the perceived\ndanger of robots. 
To fill this gap, we provided a definition of perceived\ndanger and developed and validated a 12-item bifactor scale through four\nstudies. An exploratory factor analysis revealed four subdimensions of\nperceived danger: affective states, physical vulnerability, ominousness, and\ncognitive readiness. A confirmatory factor analysis confirmed the bifactor\nmodel. We then compared the perceived danger scale to the Godspeed perceived\nsafety scale and found that the perceived danger scale is a better predictor of\nempirical data. We also validated the scale in an in-person setting and found\nthat the perceived danger scale is sensitive to robot speed manipulations,\nconsistent with previous empirical findings. Results across experiments suggest\nthat the perceived danger scale is reliable, valid, and an adequate predictor\nof both perceived safety and perceived danger in human-robot interaction\ncontexts.\n","authors":["Jaclyn Molan","Laura Saad","Eileen Roesler","J. Malcolm McCurry","Nathaniel Gyory","J. Gregory Trafton"],"pdf_url":"https://arxiv.org/pdf/2501.14099v1.pdf","comment":"9 pages, 2 figures, to be published in the Proceedings of the 2025\n ACM/IEEE International Conference on Human-Robot Interaction (HRI)"},{"id":"http://arxiv.org/abs/2501.13996v1","updated":"2025-01-23T10:57:27Z","published":"2025-01-23T10:57:27Z","title":"Integrating Persian Lip Reading in Surena-V Humanoid Robot for\n Human-Robot Interaction","summary":" Lip reading is vital for robots in social settings, improving their ability\nto understand human communication. This skill allows them to communicate more\neasily in crowded environments, especially in caregiving and customer service\nroles. Generating a Persian Lip-reading dataset, this study integrates Persian\nlip-reading technology into the Surena-V humanoid robot to improve its speech\nrecognition capabilities. Two complementary methods are explored, an indirect\nmethod using facial landmark tracking and a direct method leveraging\nconvolutional neural networks (CNNs) and long short-term memory (LSTM)\nnetworks. The indirect method focuses on tracking key facial landmarks,\nespecially around the lips, to infer movements, while the direct method\nprocesses raw video data for action and speech recognition. The best-performing\nmodel, LSTM, achieved 89\\% accuracy and has been successfully implemented into\nthe Surena-V robot for real-time human-robot interaction. The study highlights\nthe effectiveness of these methods, particularly in environments where verbal\ncommunication is limited.\n","authors":["Ali Farshian Abbasi","Aghil Yousefi-Koma","Soheil Dehghani Firouzabadi","Parisa Rashidi","Alireza Naeini"],"pdf_url":"https://arxiv.org/pdf/2501.13996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13994v1","updated":"2025-01-23T10:44:35Z","published":"2025-01-23T10:44:35Z","title":"CSAOT: Cooperative Multi-Agent System for Active Object Tracking","summary":" Object Tracking is essential for many computer vision applications, such as\nautonomous navigation, surveillance, and robotics. Unlike Passive Object\nTracking (POT), which relies on static camera viewpoints to detect and track\nobjects across consecutive frames, Active Object Tracking (AOT) requires a\ncontroller agent to actively adjust its viewpoint to maintain visual contact\nwith a moving target in complex environments. 
Existing AOT solutions are\npredominantly single-agent-based, which struggle in dynamic and complex\nscenarios due to limited information gathering and processing capabilities,\noften resulting in suboptimal decision-making. Alleviating these limitations\nnecessitates the development of a multi-agent system where different agents\nperform distinct roles and collaborate to enhance learning and robustness in\ndynamic and complex environments. Although some multi-agent approaches exist\nfor AOT, they typically rely on external auxiliary agents, which require\nadditional devices, making them costly. In contrast, we introduce the\nCollaborative System for Active Object Tracking (CSAOT), a method that\nleverages multi-agent deep reinforcement learning (MADRL) and a Mixture of\nExperts (MoE) framework to enable multiple agents to operate on a single\ndevice, thereby improving tracking performance and reducing costs. Our approach\nenhances robustness against occlusions and rapid motion while optimizing camera\nmovements to extend tracking duration. We validated the effectiveness of CSAOT\non various interactive maps with dynamic and stationary obstacles.\n","authors":["Hy Nguyen","Bao Pham","Hung Du","Srikanth Thudumu","Rajesh Vasa","Kon Mouzakis"],"pdf_url":"https://arxiv.org/pdf/2501.13994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13988v1","updated":"2025-01-23T08:27:15Z","published":"2025-01-23T08:27:15Z","title":"MCRL4OR: Multimodal Contrastive Representation Learning for Off-Road\n Environmental Perception","summary":" Most studies on environmental perception for autonomous vehicles (AVs) focus\non urban traffic environments, where the objects/stuff to be perceived are\nmainly from man-made scenes and scalable datasets with dense annotations can be\nused to train supervised learning models. By contrast, it is hard to densely\nannotate a large-scale off-road driving dataset manually due to the inherently\nunstructured nature of off-road environments. In this paper, we propose a\nMultimodal Contrastive Representation Learning approach for Off-Road\nenvironmental perception, namely MCRL4OR. This approach aims to jointly learn\nthree encoders for processing visual images, locomotion states, and control\nactions by aligning the locomotion states with the fused features of visual\nimages and control actions within a contrastive learning framework. The\ncausation behind this alignment strategy is that the inertial locomotion state\nis the result of taking a certain control action under the current\nlandform/terrain condition perceived by visual sensors. In experiments, we\npre-train the MCRL4OR with a large-scale off-road driving dataset and adopt the\nlearned multimodal representations for various downstream perception tasks in\noff-road driving scenarios. The superior performance in downstream tasks\ndemonstrates the advantages of the pre-trained multimodal representations. 
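MCRL4OR aligns locomotion states with fused image-action features in a contrastive framework. The sketch below is one plausible instantiation under our own assumptions: a symmetric InfoNCE loss and a hypothetical concatenation-plus-linear fusion, not the released MCRL4OR code.

```python
# Illustrative sketch (assumptions, not MCRL4OR's implementation): symmetric
# InfoNCE alignment between locomotion-state embeddings and fused
# image+action embeddings within a batch.
import torch
import torch.nn.functional as F

def infonce_alignment(loco_emb: torch.Tensor, fused_emb: torch.Tensor, temperature: float = 0.07):
    # loco_emb, fused_emb: (batch, dim); matching rows come from the same timestep.
    loco = F.normalize(loco_emb, dim=-1)
    fused = F.normalize(fused_emb, dim=-1)
    logits = loco @ fused.t() / temperature            # cosine similarities
    targets = torch.arange(loco.size(0), device=loco.device)
    # Symmetric cross-entropy: locomotion->fused and fused->locomotion.
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

# Toy usage with a hypothetical concatenation-based fusion head.
batch, d_img, d_act, d = 8, 256, 16, 128
fuse = torch.nn.Linear(d_img + d_act, d)
img_feat, act_feat, loco_feat = torch.rand(batch, d_img), torch.rand(batch, d_act), torch.rand(batch, d)
loss = infonce_alignment(loco_feat, fuse(torch.cat([img_feat, act_feat], dim=-1)))
print(loss.item())
```

The intuition follows the abstract's causation argument: the loss pulls together the locomotion state and the (image, action) pair that produced it, and pushes apart mismatched pairs in the batch.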
The\ncodes can be found in \\url{https://github.com/1uciusy/MCRL4OR}.\n","authors":["Yi Yang","Zhang Zhang","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2501.13988v1.pdf","comment":"Github repository: https://github.com/1uciusy/MCRL4OR"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.13928v1","updated":"2025-01-23T18:59:55Z","published":"2025-01-23T18:59:55Z","title":"Fast3R: Towards 3D Reconstruction of 1000+ Images in One Forward Pass","summary":" Multi-view 3D reconstruction remains a core challenge in computer vision,\nparticularly in applications requiring accurate and scalable representations\nacross diverse perspectives. Current leading methods such as DUSt3R employ a\nfundamentally pairwise approach, processing images in pairs and necessitating\ncostly global alignment procedures to reconstruct from multiple views. In this\nwork, we propose Fast 3D Reconstruction (Fast3R), a novel multi-view\ngeneralization to DUSt3R that achieves efficient and scalable 3D reconstruction\nby processing many views in parallel. Fast3R's Transformer-based architecture\nforwards N images in a single forward pass, bypassing the need for iterative\nalignment. Through extensive experiments on camera pose estimation and 3D\nreconstruction, Fast3R demonstrates state-of-the-art performance, with\nsignificant improvements in inference speed and reduced error accumulation.\nThese results establish Fast3R as a robust alternative for multi-view\napplications, offering enhanced scalability without compromising reconstruction\naccuracy.\n","authors":["Jianing Yang","Alexander Sax","Kevin J. Liang","Mikael Henaff","Hao Tang","Ang Cao","Joyce Chai","Franziska Meier","Matt Feiszli"],"pdf_url":"https://arxiv.org/pdf/2501.13928v1.pdf","comment":"Project website: https://fast3r-3d.github.io/"},{"id":"http://arxiv.org/abs/2501.13927v1","updated":"2025-01-23T18:59:47Z","published":"2025-01-23T18:59:47Z","title":"CRPO: Confidence-Reward Driven Preference Optimization for Machine\n Translation","summary":" Large language models (LLMs) have shown great potential in natural language\nprocessing tasks, but their application to machine translation (MT) remains\nchallenging due to pretraining on English-centric data and the complexity of\nreinforcement learning from human feedback (RLHF). Direct Preference\nOptimization (DPO) has emerged as a simpler and more efficient alternative, but\nits performance depends heavily on the quality of preference data. To address\nthis, we propose Confidence-Reward driven Preference Optimization (CRPO), a\nnovel method that combines reward scores with model confidence to improve data\nselection for fine-tuning. CRPO selects challenging sentence pairs where the\nmodel is uncertain or underperforms, leading to more effective learning. While\nprimarily designed for LLMs, CRPO also generalizes to encoder-decoder models\nlike NLLB, demonstrating its versatility. Empirical results show that CRPO\noutperforms existing methods such as RS-DPO, RSO and MBR score in both\ntranslation accuracy and data efficiency.\n","authors":["Guofeng Cui","Pichao Wang","Yang Liu","Zemian Ke","Zhu Liu","Vimal Bhat"],"pdf_url":"https://arxiv.org/pdf/2501.13927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13926v1","updated":"2025-01-23T18:59:43Z","published":"2025-01-23T18:59:43Z","title":"Can We Generate Images with CoT? 
Let's Verify and Reinforce Image\n Generation Step by Step","summary":" Chain-of-Thought (CoT) reasoning has been extensively explored in large\nmodels to tackle complex understanding tasks. However, it still remains an open\nquestion whether such strategies can be applied to verifying and reinforcing\nimage generation scenarios. In this paper, we provide the first comprehensive\ninvestigation of the potential of CoT reasoning to enhance autoregressive image\ngeneration. We focus on three techniques: scaling test-time computation for\nverification, aligning model preferences with Direct Preference Optimization\n(DPO), and integrating these techniques for complementary effects. Our results\ndemonstrate that these approaches can be effectively adapted and combined to\nsignificantly improve image generation performance. Furthermore, given the\npivotal role of reward models in our findings, we propose the Potential\nAssessment Reward Model (PARM) and PARM++, specialized for autoregressive image\ngeneration. PARM adaptively assesses each generation step through a potential\nassessment approach, merging the strengths of existing reward models, and\nPARM++ further introduces a reflection mechanism to self-correct the generated\nunsatisfactory image. Using our investigated reasoning strategies, we enhance a\nbaseline model, Show-o, to achieve superior results, with a significant +24%\nimprovement on the GenEval benchmark, surpassing Stable Diffusion 3 by +15%. We\nhope our study provides unique insights and paves a new path for integrating\nCoT reasoning with autoregressive image generation. Code and models are\nreleased at https://github.com/ZiyuGuo99/Image-Generation-CoT\n","authors":["Ziyu Guo","Renrui Zhang","Chengzhuo Tong","Zhizheng Zhao","Peng Gao","Hongsheng Li","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2501.13926v1.pdf","comment":"Journal Version. Code and models are released at\n https://github.com/ZiyuGuo99/Image-Generation-CoT"},{"id":"http://arxiv.org/abs/2501.13924v1","updated":"2025-01-23T18:59:30Z","published":"2025-01-23T18:59:30Z","title":"Towards Robust Multimodal Open-set Test-time Adaptation via Adaptive\n Entropy-aware Optimization","summary":" Test-time adaptation (TTA) has demonstrated significant potential in\naddressing distribution shifts between training and testing data. Open-set\ntest-time adaptation (OSTTA) aims to adapt a source pre-trained model online to\nan unlabeled target domain that contains unknown classes. This task becomes\nmore challenging when multiple modalities are involved. Existing methods have\nprimarily focused on unimodal OSTTA, often filtering out low-confidence samples\nwithout addressing the complexities of multimodal data. In this work, we\npresent Adaptive Entropy-aware Optimization (AEO), a novel framework\nspecifically designed to tackle Multimodal Open-set Test-time Adaptation\n(MM-OSTTA) for the first time. Our analysis shows that the entropy difference\nbetween known and unknown samples in the target domain strongly correlates with\nMM-OSTTA performance. To leverage this, we propose two key components:\nUnknown-aware Adaptive Entropy Optimization (UAE) and Adaptive Modality\nPrediction Discrepancy Optimization (AMP). These components enhance the ability\nof model to distinguish unknown class samples during online adaptation by\namplifying the entropy difference between known and unknown samples. To\nthoroughly evaluate our proposed methods in the MM-OSTTA setting, we establish\na new benchmark derived from existing datasets. 
This benchmark includes two\ndownstream tasks and incorporates five modalities. Extensive experiments across\nvarious domain shift situations demonstrate the efficacy and versatility of the\nAEO framework. Additionally, we highlight the strong performance of AEO in\nlong-term and continual MM-OSTTA settings, both of which are challenging and\nhighly relevant to real-world applications. Our source code is available at\nhttps://github.com/donghao51/AEO.\n","authors":["Hao Dong","Eleni Chatzi","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2501.13924v1.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2501.13925v1","updated":"2025-01-23T18:59:30Z","published":"2025-01-23T18:59:30Z","title":"GeoPixel: Pixel Grounding Large Multimodal Model in Remote Sensing","summary":" Recent advances in large multimodal models (LMMs) have recognized\nfine-grained grounding as an imperative factor of visual understanding and\ndialogue. However, the benefits of such representation in LMMs are limited to\nthe natural image domain, and these models perform poorly for remote sensing\n(RS). The distinct overhead viewpoint, scale variation, and presence of small\nobjects in high-resolution RS imagery present a unique challenge in\nregion-level comprehension. Moreover, the development of the grounding\nconversation capability of LMMs within RS is hindered by the lack of granular,\nRS domain-specific grounded data. Addressing these limitations, we propose\nGeoPixel - the first end-to-end high resolution RS-LMM that supports\npixel-level grounding. This capability allows fine-grained visual perception by\ngenerating interleaved masks in conversation. GeoPixel supports up to 4K HD\nresolution in any aspect ratio, ideal for high-precision RS image analysis. To\nsupport the grounded conversation generation (GCG) in RS imagery, we curate a\nvisually grounded dataset GeoPixelD through a semi-automated pipeline that\nutilizes set-of-marks prompting and spatial priors tailored for RS data to\nmethodically control the data generation process. GeoPixel demonstrates\nsuperior performance in pixel-level comprehension, surpassing existing LMMs in\nboth single-target and multi-target segmentation tasks. Our methodological\nablation studies validate the effectiveness of each component in the overall\narchitecture. Our code and data will be publicly released.\n","authors":["Akashah Shabbir","Mohammed Zumri","Mohammed Bennamoun","Fahad S. Khan","Salman Khan"],"pdf_url":"https://arxiv.org/pdf/2501.13925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13920v1","updated":"2025-01-23T18:58:33Z","published":"2025-01-23T18:58:33Z","title":"IMAGINE-E: Image Generation Intelligence Evaluation of State-of-the-art\n Text-to-Image Models","summary":" With the rapid development of diffusion models, text-to-image(T2I) models\nhave made significant progress, showcasing impressive abilities in prompt\nfollowing and image generation. Recently launched models such as FLUX.1 and\nIdeogram2.0, along with others like Dall-E3 and Stable Diffusion 3, have\ndemonstrated exceptional performance across various complex tasks, raising\nquestions about whether T2I models are moving towards general-purpose\napplicability. Beyond traditional image generation, these models exhibit\ncapabilities across a range of fields, including controllable generation, image\nediting, video, audio, 3D, and motion generation, as well as computer vision\ntasks like semantic segmentation and depth estimation. 
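The AEO abstract above hinges on the entropy gap between known and unknown samples at test time. As a rough, hedged illustration of that idea only, the sketch below computes per-sample softmax entropy and uses a crude median-based pseudo-split to widen the gap; this split and objective are our assumptions, not the paper's UAE/AMP components.

```python
# Hedged sketch, not the AEO objective: minimize entropy on likely-known
# samples and maximize it on likely-unknown ones, using a median pseudo-split.
import torch
import torch.nn.functional as F

def entropy_gap_loss(logits: torch.Tensor) -> torch.Tensor:
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=-1)      # (batch,)
    threshold = entropy.median().detach()                         # crude pseudo-labeling
    known = entropy <= threshold
    # Push known-sample entropy down and unknown-sample entropy up.
    return entropy[known].mean() - entropy[~known].mean()

logits = torch.randn(16, 10, requires_grad=True)
loss = entropy_gap_loss(logits)
loss.backward()
print(loss.item())
```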
However, current\nevaluation frameworks are insufficient to comprehensively assess these models'\nperformance across expanding domains. To thoroughly evaluate these models, we\ndeveloped the IMAGINE-E and tested six prominent models: FLUX.1, Ideogram2.0,\nMidjourney, Dall-E3, Stable Diffusion 3, and Jimeng. Our evaluation is divided\ninto five key domains: structured output generation, realism, and physical\nconsistency, specific domain generation, challenging scenario generation, and\nmulti-style creation tasks. This comprehensive assessment highlights each\nmodel's strengths and limitations, particularly the outstanding performance of\nFLUX.1 and Ideogram2.0 in structured and specific domain tasks, underscoring\nthe expanding applications and potential of T2I models as foundational AI\ntools. This study provides valuable insights into the current state and future\ntrajectory of T2I models as they evolve towards general-purpose usability.\nEvaluation scripts will be released at https://github.com/jylei16/Imagine-e.\n","authors":["Jiayi Lei","Renrui Zhang","Xiangfei Hu","Weifeng Lin","Zhen Li","Wenjian Sun","Ruoyi Du","Le Zhuo","Zhongyu Li","Xinyue Li","Shitian Zhao","Ziyu Guo","Yiting Lu","Peng Gao","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2501.13920v1.pdf","comment":"75 pages, 73 figures, Evaluation scripts:\n https://github.com/jylei16/Imagine-e"},{"id":"http://arxiv.org/abs/2501.13919v1","updated":"2025-01-23T18:58:03Z","published":"2025-01-23T18:58:03Z","title":"Temporal Preference Optimization for Long-Form Video Understanding","summary":" Despite significant advancements in video large multimodal models\n(video-LMMs), achieving effective temporal grounding in long-form videos\nremains a challenge for existing models. To address this limitation, we propose\nTemporal Preference Optimization (TPO), a novel post-training framework\ndesigned to enhance the temporal grounding capabilities of video-LMMs through\npreference learning. TPO adopts a self-training approach that enables models to\ndifferentiate between well-grounded and less accurate temporal responses by\nleveraging curated preference datasets at two granularities: localized temporal\ngrounding, which focuses on specific video segments, and comprehensive temporal\ngrounding, which captures extended temporal dependencies across entire video\nsequences. By optimizing on these preference datasets, TPO significantly\nenhances temporal understanding while reducing reliance on manually annotated\ndata. Extensive experiments on three long-form video understanding\nbenchmarks--LongVideoBench, MLVU, and Video-MME--demonstrate the effectiveness\nof TPO across two state-of-the-art video-LMMs. Notably, LLaVA-Video-TPO\nestablishes itself as the leading 7B model on the Video-MME benchmark,\nunderscoring the potential of TPO as a scalable and efficient solution for\nadvancing temporal reasoning in long-form video understanding. Project page:\nhttps://ruili33.github.io/tpo_website.\n","authors":["Rui Li","Xiaohan Wang","Yuhui Zhang","Zeyu Wang","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2501.13919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14895v2","updated":"2025-01-23T18:56:32Z","published":"2024-10-18T22:38:08Z","title":"Truncated Consistency Models","summary":" Consistency models have recently been introduced to accelerate sampling from\ndiffusion models by directly predicting the solution (i.e., data) of the\nprobability flow ODE (PF ODE) from initial noise. 
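TPO, summarized above, optimizes video-LMMs on curated temporal preference pairs. As a hedged illustration of the preference-learning step, here is a generic DPO-style loss over chosen and rejected responses; TPO's exact formulation, granularities, and data curation are described in the paper, not here.

```python
# Generic DPO-style preference loss (a sketch, not TPO's exact objective).
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta: float = 0.1):
    """All inputs: (batch,) summed log-likelihoods of full responses."""
    chosen_ratio = logp_chosen - ref_logp_chosen          # policy vs. frozen reference
    rejected_ratio = logp_rejected - ref_logp_rejected
    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()

# Toy usage with made-up log-probabilities for well-grounded vs. poorly grounded answers.
loss = dpo_loss(torch.tensor([-10.0, -12.0]), torch.tensor([-11.5, -12.5]),
                torch.tensor([-10.5, -12.2]), torch.tensor([-11.0, -12.4]))
print(loss.item())
```

In the self-training setup the abstract describes, the "chosen" response would be the better-grounded temporal answer and the "rejected" one the less accurate alternative.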
However, the training of\nconsistency models requires learning to map all intermediate points along PF\nODE trajectories to their corresponding endpoints. This task is much more\nchallenging than the ultimate objective of one-step generation, which only\nconcerns the PF ODE's noise-to-data mapping. We empirically find that this\ntraining paradigm limits the one-step generation performance of consistency\nmodels. To address this issue, we generalize consistency training to the\ntruncated time range, which allows the model to ignore denoising tasks at\nearlier time steps and focus its capacity on generation. We propose a new\nparameterization of the consistency function and a two-stage training procedure\nthat prevents the truncated-time training from collapsing to a trivial\nsolution. Experiments on CIFAR-10 and ImageNet $64\\times64$ datasets show that\nour method achieves better one-step and two-step FIDs than the state-of-the-art\nconsistency models such as iCT-deep, using more than 2$\\times$ smaller\nnetworks. Project page: https://truncated-cm.github.io/\n","authors":["Sangyun Lee","Yilun Xu","Tomas Geffner","Giulia Fanti","Karsten Kreis","Arash Vahdat","Weili Nie"],"pdf_url":"https://arxiv.org/pdf/2410.14895v2.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2501.13918v1","updated":"2025-01-23T18:55:41Z","published":"2025-01-23T18:55:41Z","title":"Improving Video Generation with Human Feedback","summary":" Video generation has achieved significant advances through rectified flow\ntechniques, but issues like unsmooth motion and misalignment between videos and\nprompts persist. In this work, we develop a systematic pipeline that harnesses\nhuman feedback to mitigate these problems and refine the video generation\nmodel. Specifically, we begin by constructing a large-scale human preference\ndataset focused on modern video generation models, incorporating pairwise\nannotations across multi-dimensions. We then introduce VideoReward, a\nmulti-dimensional video reward model, and examine how annotations and various\ndesign choices impact its rewarding efficacy. From a unified reinforcement\nlearning perspective aimed at maximizing reward with KL regularization, we\nintroduce three alignment algorithms for flow-based models by extending those\nfrom diffusion models. These include two training-time strategies: direct\npreference optimization for flow (Flow-DPO) and reward weighted regression for\nflow (Flow-RWR), and an inference-time technique, Flow-NRG, which applies\nreward guidance directly to noisy videos. Experimental results indicate that\nVideoReward significantly outperforms existing reward models, and Flow-DPO\ndemonstrates superior performance compared to both Flow-RWR and standard\nsupervised fine-tuning methods. Additionally, Flow-NRG lets users assign custom\nweights to multiple objectives during inference, meeting personalized video\nquality needs. 
Project page: https://gongyeliu.github.io/videoalign.\n","authors":["Jie Liu","Gongye Liu","Jiajun Liang","Ziyang Yuan","Xiaokun Liu","Mingwu Zheng","Xiele Wu","Qiulin Wang","Wenyu Qin","Menghan Xia","Xintao Wang","Xiaohong Liu","Fei Yang","Pengfei Wan","Di Zhang","Kun Gai","Yujiu Yang","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2501.13918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13915v1","updated":"2025-01-23T18:52:47Z","published":"2025-01-23T18:52:47Z","title":"Binary Diffusion Probabilistic Model","summary":" We introduce the Binary Diffusion Probabilistic Model (BDPM), a novel\ngenerative model optimized for binary data representations. While denoising\ndiffusion probabilistic models (DDPMs) have demonstrated notable success in\ntasks like image synthesis and restoration, traditional DDPMs rely on\ncontinuous data representations and mean squared error (MSE) loss for training,\napplying Gaussian noise models that may not be optimal for discrete or binary\ndata structures. BDPM addresses this by decomposing images into bitplanes and\nemploying XOR-based noise transformations, with a denoising model trained using\nbinary cross-entropy loss. This approach enables precise noise control and\ncomputationally efficient inference, significantly lowering computational costs\nand improving model convergence. When evaluated on image restoration tasks such\nas image super-resolution, inpainting, and blind image restoration, BDPM\noutperforms state-of-the-art methods on the FFHQ, CelebA, and CelebA-HQ\ndatasets. Notably, BDPM requires fewer inference steps than traditional DDPM\nmodels to reach optimal results, showcasing enhanced inference efficiency.\n","authors":["Vitaliy Kinakh","Slava Voloshynovskiy"],"pdf_url":"https://arxiv.org/pdf/2501.13915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13898v1","updated":"2025-01-23T18:18:15Z","published":"2025-01-23T18:18:15Z","title":"PointOBB-v3: Expanding Performance Boundaries of Single Point-Supervised\n Oriented Object Detection","summary":" With the growing demand for oriented object detection (OOD), recent studies\non point-supervised OOD have attracted significant interest. In this paper, we\npropose PointOBB-v3, a stronger single point-supervised OOD framework. Compared\nto existing methods, it generates pseudo rotated boxes without additional\npriors and incorporates support for the end-to-end paradigm. PointOBB-v3\nfunctions by integrating three unique image views: the original view, a resized\nview, and a rotated/flipped (rot/flp) view. Based on the views, a scale\naugmentation module and an angle acquisition module are constructed. In the\nfirst module, a Scale-Sensitive Consistency (SSC) loss and a Scale-Sensitive\nFeature Fusion (SSFF) module are introduced to improve the model's ability to\nestimate object scale. To achieve precise angle predictions, the second module\nemploys symmetry-based self-supervised learning. Additionally, we introduce an\nend-to-end version that eliminates the pseudo-label generation process by\nintegrating a detector branch and introduces an Instance-Aware Weighting (IAW)\nstrategy to focus on high-quality predictions. We conducted extensive\nexperiments on the DIOR-R, DOTA-v1.0/v1.5/v2.0, FAIR1M, STAR, and RSAR\ndatasets. Across all these datasets, our method achieves an average improvement\nin accuracy of 3.56% in comparison to previous state-of-the-art methods. 
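The BDPM abstract above decomposes images into bitplanes and corrupts them with XOR-based noise. Below is a small numpy sketch of that forward corruption; the flip probability and schedule are assumptions for illustration, and the learned denoiser is omitted.

```python
# Minimal sketch (details assumed): bitplane decomposition of an 8-bit image
# and XOR bit-flip noise, the binary analogue of additive Gaussian noise.
import numpy as np

def to_bitplanes(img_u8: np.ndarray) -> np.ndarray:
    # img_u8: (H, W) uint8 -> (8, H, W) binary planes, index 0 = least significant bit.
    return np.stack([(img_u8 >> b) & 1 for b in range(8)]).astype(np.uint8)

def xor_noise(bitplanes: np.ndarray, flip_prob: float, rng=np.random) -> np.ndarray:
    flips = (rng.random(bitplanes.shape) < flip_prob).astype(np.uint8)
    return bitplanes ^ flips                 # XOR flips each bit with probability flip_prob

def from_bitplanes(bitplanes: np.ndarray) -> np.ndarray:
    weights = (1 << np.arange(8)).reshape(8, 1, 1)
    return (bitplanes * weights).sum(axis=0).astype(np.uint8)

img = (np.random.rand(32, 32) * 255).astype(np.uint8)
noisy = from_bitplanes(xor_noise(to_bitplanes(img), flip_prob=0.1))
print(img.shape, noisy.shape)
```

A denoising network trained with binary cross-entropy, as the abstract states, would then predict which bits to flip back.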
The\ncode will be available at https://github.com/ZpyWHU/PointOBB-v3.\n","authors":["Peiyuan Zhang","Junwei Luo","Xue Yang","Yi Yu","Qingyun Li","Yue Zhou","Xiaosong Jia","Xudong Lu","Jingdong Chen","Xiang Li","Junchi Yan","Yansheng Li"],"pdf_url":"https://arxiv.org/pdf/2501.13898v1.pdf","comment":"16 pages, 5 figures, 10 tables"},{"id":"http://arxiv.org/abs/2501.13896v1","updated":"2025-01-23T18:16:21Z","published":"2025-01-23T18:16:21Z","title":"GUI-Bee: Align GUI Action Grounding to Novel Environments via Autonomous\n Exploration","summary":" Graphical User Interface (GUI) action grounding is a critical step in GUI\nautomation that maps language instructions to actionable elements on GUI\nscreens. Most recent works of GUI action grounding leverage large GUI datasets\nto fine-tune MLLMs. However, the fine-tuning data always covers limited GUI\nenvironments, and we find the performance of the resulting model deteriorates\nin novel environments. We argue that the GUI grounding models should be further\naligned to the novel environments to reveal their full potential, when the\ninference is known to involve novel environments, i.e., environments not used\nduring the previous fine-tuning. To realize this, we first propose GUI-Bee, an\nMLLM-based autonomous agent, to collect high-quality, environment-specific data\nthrough exploration and then continuously fine-tune GUI grounding models with\nthe collected data. Our agent leverages a novel Q-value-Incentive In-Context\nReinforcement Learning (Q-ICRL) method to optimize exploration efficiency and\ndata quality. Additionally, we introduce NovelScreenSpot, a benchmark for\ntesting how well the data can help align GUI action grounding models to novel\nenvironments and demonstrate the effectiveness of data collected by GUI-Bee in\nthe experiments. Furthermore, we conduct an ablation study to validate the\nQ-ICRL method in enhancing the efficiency of GUI-Bee. Project page:\nhttps://gui-bee.github.io\n","authors":["Yue Fan","Handong Zhao","Ruiyi Zhang","Yu Shen","Xin Eric Wang","Gang Wu"],"pdf_url":"https://arxiv.org/pdf/2501.13896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13107v2","updated":"2025-01-23T18:13:35Z","published":"2025-01-22T18:59:58Z","title":"Accelerate High-Quality Diffusion Models with Inner Loop Feedback","summary":" We propose Inner Loop Feedback (ILF), a novel approach to accelerate\ndiffusion models' inference. ILF trains a lightweight module to predict future\nfeatures in the denoising process by leveraging the outputs from a chosen\ndiffusion backbone block at a given time step. This approach exploits two key\nintuitions; (1) the outputs of a given block at adjacent time steps are\nsimilar, and (2) performing partial computations for a step imposes a lower\nburden on the model than skipping the step entirely. Our method is highly\nflexible, since we find that the feedback module itself can simply be a block\nfrom the diffusion backbone, with all settings copied. Its influence on the\ndiffusion forward can be tempered with a learnable scaling factor from zero\ninitialization. We train this module using distillation losses; however, unlike\nsome prior work where a full diffusion backbone serves as the student, our\nmodel freezes the backbone, training only the feedback module. 
While many\nefforts to optimize diffusion models focus on achieving acceptable image\nquality in extremely few steps (1-4 steps), our emphasis is on matching best\ncase results (typically achieved in 20 steps) while significantly reducing\nruntime. ILF achieves this balance effectively, demonstrating strong\nperformance for both class-to-image generation with diffusion transformer (DiT)\nand text-to-image generation with DiT-based PixArt-alpha and PixArt-sigma. The\nquality of ILF's 1.7x-1.8x speedups are confirmed by FID, CLIP score, CLIP\nImage Quality Assessment, ImageReward, and qualitative comparisons. Project\ninformation is available at https://mgwillia.github.io/ilf.\n","authors":["Matthew Gwilliam","Han Cai","Di Wu","Abhinav Shrivastava","Zhiyu Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.13107v2.pdf","comment":"submission currently under review; 20 pages, 17 figures, 6 tables"},{"id":"http://arxiv.org/abs/2501.13893v1","updated":"2025-01-23T18:08:57Z","published":"2025-01-23T18:08:57Z","title":"Pix2Cap-COCO: Advancing Visual Comprehension via Pixel-Level Captioning","summary":" We present Pix2Cap-COCO, the first panoptic pixel-level caption dataset\ndesigned to advance fine-grained visual understanding. To achieve this, we\ncarefully design an automated annotation pipeline that prompts GPT-4V to\ngenerate pixel-aligned, instance-specific captions for individual objects\nwithin images, enabling models to learn more granular relationships between\nobjects and their contexts. This approach results in 167,254 detailed captions,\nwith an average of 22.94 words per caption. Building on Pix2Cap-COCO, we\nintroduce a novel task, panoptic segmentation-captioning, which challenges\nmodels to recognize instances in an image and provide detailed descriptions for\neach simultaneously. To benchmark this task, we design a robust baseline based\non X-Decoder. The experimental results demonstrate that Pix2Cap-COCO is a\nparticularly challenging dataset, as it requires models to excel in both\nfine-grained visual understanding and detailed language generation.\nFurthermore, we leverage Pix2Cap-COCO for Supervised Fine-Tuning (SFT) on large\nmultimodal models (LMMs) to enhance their performance. For example, training\nwith Pix2Cap-COCO significantly improves the performance of GPT4RoI, yielding\ngains in CIDEr +1.4%, ROUGE +0.4%, and SPICE +0.5% on Visual Genome dataset,\nand strengthens its region understanding ability on the ViP-BENCH, with an\noverall improvement of +5.1%, including notable increases in recognition\naccuracy +11.2% and language generation quality +22.2%.\n","authors":["Zuyao You","Junke Wang","Lingyu Kong","Bo He","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2501.13893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13889v1","updated":"2025-01-23T18:01:19Z","published":"2025-01-23T18:01:19Z","title":"Generating Realistic Forehead-Creases for User Verification via\n Conditioned Piecewise Polynomial Curves","summary":" We propose a trait-specific image generation method that models forehead\ncreases geometrically using B-spline and B\\'ezier curves. This approach ensures\nthe realistic generation of both principal creases and non-prominent crease\npatterns, effectively constructing detailed and authentic forehead-crease\nimages. These geometrically rendered images serve as visual prompts for a\ndiffusion-based Edge-to-Image translation model, which generates corresponding\nmated samples. 
The resulting novel synthetic identities are then used to train\na forehead-crease verification network. To enhance intra-subject diversity in\nthe generated samples, we employ two strategies: (a) perturbing the control\npoints of B-splines under defined constraints to maintain label consistency,\nand (b) applying image-level augmentations to the geometric visual prompts,\nsuch as dropout and elastic transformations, specifically tailored to crease\npatterns. By integrating the proposed synthetic dataset with real-world data,\nour method significantly improves the performance of forehead-crease\nverification systems under a cross-database verification protocol.\n","authors":["Abhishek Tandon","Geetanjali Sharma","Gaurav Jaswal","Aditya Nigam","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2501.13889v1.pdf","comment":"Accepted at WACV-W 2025"},{"id":"http://arxiv.org/abs/2501.13888v1","updated":"2025-01-23T18:01:01Z","published":"2025-01-23T18:01:01Z","title":"Multimodal Sensor Dataset for Monitoring Older Adults Post Lower-Limb\n Fractures in Community Settings","summary":" Lower-Limb Fractures (LLF) are a major health concern for older adults, often\nleading to reduced mobility and prolonged recovery, potentially impairing daily\nactivities and independence. During recovery, older adults frequently face\nsocial isolation and functional decline, complicating rehabilitation and\nadversely affecting physical and mental health. Multi-modal sensor platforms\nthat continuously collect data and analyze it using machine-learning algorithms\ncan remotely monitor this population and infer health outcomes. They can also\nalert clinicians to individuals at risk of isolation and decline. This paper\npresents a new publicly available multi-modal sensor dataset, MAISON-LLF,\ncollected from older adults recovering from LLF in community settings. The\ndataset includes data from smartphone and smartwatch sensors, motion detectors,\nsleep-tracking mattresses, and clinical questionnaires on isolation and\ndecline. The dataset was collected from ten older adults living alone at home\nfor eight weeks each, totaling 560 days of 24-hour sensor data. For technical\nvalidation, supervised machine-learning and deep-learning models were developed\nusing the sensor and clinical questionnaire data, providing a foundational\ncomparison for the research community.\n","authors":["Ali Abedi","Charlene H. Chu","Shehroz S. Khan"],"pdf_url":"https://arxiv.org/pdf/2501.13888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13878v1","updated":"2025-01-23T17:51:54Z","published":"2025-01-23T17:51:54Z","title":"Eye Gaze as a Signal for Conveying User Attention in Contextual AI\n Systems","summary":" Advanced multimodal AI agents can now collaborate with users to solve\nchallenges in the world. We explore eye tracking's role in such interaction to\nconvey a user's attention relative to the physical environment. We hypothesize\nthat this knowledge improves contextual understanding for AI agents. By\nobserving hours of human-object interactions, we first measure the relationship\nbetween an eye tracker's signal quality and its ability to reliably place gaze\non nearby physical objects. We then conduct experiments which relay the user's\nscanpath history as additional context querying multimodal agents. 
Our results\nshow that eye tracking provides high value as a user attention signal and can\nconvey information about the user's current task and interests to the agent.\n","authors":["Ethan Wilson","Naveen Sendhilnathan","Charlie S. Burlingham","Yusuf Mansour","Robert Cavin","Sai Deep Tetali","Ajoy Savio Fernandes","Michael J. Proulx"],"pdf_url":"https://arxiv.org/pdf/2501.13878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13859v1","updated":"2025-01-23T17:30:27Z","published":"2025-01-23T17:30:27Z","title":"Dual-Modal Prototype Joint Learning for Compositional Zero-Shot Learning","summary":" Compositional Zero-Shot Learning (CZSL) aims to recognize novel compositions\nof attributes and objects by leveraging knowledge learned from seen\ncompositions. Recent approaches have explored the use of Vision-Language Models\n(VLMs) to align textual and visual modalities. These methods typically employ\nprompt engineering, parameter-tuning, and modality fusion to generate rich\ntextual prototypes that serve as class prototypes for CZSL. However, the\nmodality gap results in textual prototypes being unable to fully capture the\noptimal representations of all class prototypes, particularly those with\nfine-grained features, which can be directly obtained from the visual modality.\nIn this paper, we propose a novel Dual-Modal Prototype Joint Learning framework\nfor the CZSL task. Our approach, based on VLMs, introduces prototypes in both\nthe textual and visual modalities. The textual prototype is optimized to\ncapture broad conceptual information, aiding the model's generalization across\nunseen compositions. Meanwhile, the visual prototype is used to mitigate the\nclassification errors caused by the modality gap and capture fine-grained\ndetails to distinguish images with similar appearances. To effectively optimize\nthese prototypes, we design specialized decomposition modules and a joint\nlearning strategy that enrich the features from both modalities. These\nprototypes not only capture key category information during training but also\nserve as crucial reference targets during inference. Experimental results\ndemonstrate that our approach achieves state-of-the-art performance in the\nclosed-world setting and competitive performance in the open-world setting\nacross three publicly available CZSL benchmarks. These findings validate the\neffectiveness of our method in advancing compositional generalization.\n","authors":["Shiyu Zhang","Cheng Yan","Yang Liu","Chenchen Jing","Lei Zhou","Wenjun Wang"],"pdf_url":"https://arxiv.org/pdf/2501.13859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13855v1","updated":"2025-01-23T17:24:24Z","published":"2025-01-23T17:24:24Z","title":"First Lessons Learned of an Artificial Intelligence Robotic System for\n Autonomous Coarse Waste Recycling Using Multispectral Imaging-Based Methods","summary":" Current disposal facilities for coarse-grained waste perform manual sorting\nof materials with heavy machinery. Large quantities of recyclable materials are\nlost to coarse waste, so more effective sorting processes must be developed to\nrecover them. Two key aspects to automate the sorting process are object\ndetection with material classification in mixed piles of waste, and autonomous\ncontrol of hydraulic machinery. Because most objects in those accumulations of\nwaste are damaged or destroyed, object detection alone is not feasible in the\nmajority of cases. 
To address these challenges, we propose a classification of\nmaterials with multispectral images of ultraviolet (UV), visual (VIS), near\ninfrared (NIR), and short-wave infrared (SWIR) spectrums. Solution for\nautonomous control of hydraulic heavy machines for sorting of bulky waste is\nbeing investigated using cost-effective cameras and artificial\nintelligence-based controllers.\n","authors":["Timo Lange","Ajish Babu","Philipp Meyer","Matthis Keppner","Tim Tiedemann","Martin Wittmaier","Sebastian Wolff","Thomas Vögele"],"pdf_url":"https://arxiv.org/pdf/2501.13855v1.pdf","comment":"Published in Proceedings of Sardinia 2023, 19th International\n Symposium on Waste Management, Resource Recovery and Sustainable Landfilling"},{"id":"http://arxiv.org/abs/2410.11610v4","updated":"2025-01-23T17:18:07Z","published":"2024-10-15T13:46:19Z","title":"Enhanced Encoder-Decoder Architecture for Accurate Monocular Depth\n Estimation","summary":" Estimating depth from a single 2D image is a challenging task due to the lack\nof stereo or multi-view data, which are typically required for depth\nperception. In state-of-the-art architectures, the main challenge is to\nefficiently capture complex objects and fine-grained details, which are often\ndifficult to predict. This paper introduces a novel deep learning-based\napproach using an enhanced encoder-decoder architecture, where the\nInception-ResNet-v2 model serves as the encoder. This is the first instance of\nutilizing Inception-ResNet-v2 as an encoder for monocular depth estimation,\ndemonstrating improved performance over previous models. It incorporates\nmulti-scale feature extraction to enhance depth prediction accuracy across\nvarious object sizes and distances. We propose a composite loss function\ncomprising depth loss, gradient edge loss, and Structural Similarity Index\nMeasure (SSIM) loss, with fine-tuned weights to optimize the weighted sum,\nensuring a balance across different aspects of depth estimation. Experimental\nresults on the KITTI dataset show that our model achieves a significantly\nfaster inference time of 0.019 seconds, outperforming vision transformers in\nefficiency while maintaining good accuracy. On the NYU Depth V2 dataset, the\nmodel establishes state-of-the-art performance, with an Absolute Relative Error\n(ARE) of 0.064, a Root Mean Square Error (RMSE) of 0.228, and an accuracy of\n89.3% for $\\delta$ < 1.25. These metrics demonstrate that our model can\naccurately and efficiently predict depth even in challenging scenarios,\nproviding a practical solution for real-time applications.\n","authors":["Dabbrata Das","Argho Deb Das","Farhan Sadaf"],"pdf_url":"https://arxiv.org/pdf/2410.11610v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13848v1","updated":"2025-01-23T17:15:26Z","published":"2025-01-23T17:15:26Z","title":"Where Do You Go? Pedestrian Trajectory Prediction using Scene Features","summary":" Accurate prediction of pedestrian trajectories is crucial for enhancing the\nsafety of autonomous vehicles and reducing traffic fatalities involving\npedestrians. While numerous studies have focused on modeling interactions among\npedestrians to forecast their movements, the influence of environmental factors\nand scene-object placements has been comparatively underexplored. In this\npaper, we present a novel trajectory prediction model that integrates both\npedestrian interactions and environmental context to improve prediction\naccuracy. 
Our approach captures spatial and temporal interactions among\npedestrians within a sparse graph framework. To account for pedestrian-scene\ninteractions, we employ advanced image enhancement and semantic segmentation\ntechniques to extract detailed scene features. These scene and interaction\nfeatures are then fused through a cross-attention mechanism, enabling the model\nto prioritize relevant environmental factors that influence pedestrian\nmovements. Finally, a temporal convolutional network processes the fused\nfeatures to predict future pedestrian trajectories. Experimental results\ndemonstrate that our method significantly outperforms existing state-of-the-art\napproaches, achieving ADE and FDE values of 0.252 and 0.372 meters,\nrespectively, underscoring the importance of incorporating both social\ninteractions and environmental context in pedestrian trajectory prediction.\n","authors":["Mohammad Ali Rezaei","Fardin Ayar","Ehsan Javanmardi","Manabu Tsukada","Mahdi Javanmardi"],"pdf_url":"https://arxiv.org/pdf/2501.13848v1.pdf","comment":"Accepted by 2024 International Conference on Intelligent Computing\n and its Emerging Applications"},{"id":"http://arxiv.org/abs/2410.08159v2","updated":"2025-01-23T17:08:57Z","published":"2024-10-10T17:41:54Z","title":"DART: Denoising Autoregressive Transformer for Scalable Text-to-Image\n Generation","summary":" Diffusion models have become the dominant approach for visual generation.\nThey are trained by denoising a Markovian process which gradually adds noise to\nthe input. We argue that the Markovian property limits the model's ability to\nfully utilize the generation trajectory, leading to inefficiencies during\ntraining and inference. In this paper, we propose DART, a transformer-based\nmodel that unifies autoregressive (AR) and diffusion within a non-Markovian\nframework. DART iteratively denoises image patches spatially and spectrally\nusing an AR model that has the same architecture as standard language models.\nDART does not rely on image quantization, which enables more effective image\nmodeling while maintaining flexibility. Furthermore, DART seamlessly trains\nwith both text and image data in a unified model. Our approach demonstrates\ncompetitive performance on class-conditioned and text-to-image generation\ntasks, offering a scalable, efficient alternative to traditional diffusion\nmodels. Through this unified framework, DART sets a new benchmark for scalable,\nhigh-quality image synthesis.\n","authors":["Jiatao Gu","Yuyang Wang","Yizhe Zhang","Qihang Zhang","Dinghuai Zhang","Navdeep Jaitly","Josh Susskind","Shuangfei Zhai"],"pdf_url":"https://arxiv.org/pdf/2410.08159v2.pdf","comment":"Accepted by ICLR2025"},{"id":"http://arxiv.org/abs/2501.13829v1","updated":"2025-01-23T16:53:46Z","published":"2025-01-23T16:53:46Z","title":"MV-GMN: State Space Model for Multi-View Action Recognition","summary":" Recent advancements in multi-view action recognition have largely relied on\nTransformer-based models. While effective and adaptable, these models often\nrequire substantial computational resources, especially in scenarios with\nmultiple views and multiple temporal sequences. Addressing this limitation,\nthis paper introduces the MV-GMN model, a state-space model specifically\ndesigned to efficiently aggregate multi-modal data (RGB and skeleton),\nmulti-view perspectives, and multi-temporal information for action recognition\nwith reduced computational complexity. 
The MV-GMN model employs an innovative\nMulti-View Graph Mamba network comprising a series of MV-GMN blocks. Each block\nincludes a proposed Bidirectional State Space Block and a GCN module. The\nBidirectional State Space Block introduces four scanning strategies, including\nview-prioritized and time-prioritized approaches. The GCN module leverages\nrule-based and KNN-based methods to construct the graph network, effectively\nintegrating features from different viewpoints and temporal instances.\nDemonstrating its efficacy, MV-GMN outperforms the state-of-the-arts on several\ndatasets, achieving notable accuracies of 97.3\\% and 96.7\\% on the NTU RGB+D\n120 dataset in cross-subject and cross-view scenarios, respectively. MV-GMN\nalso surpasses Transformer-based baselines while requiring only linear\ninference complexity, underscoring the model's ability to reduce computational\nload and enhance the scalability and applicability of multi-view action\nrecognition technologies.\n","authors":["Yuhui Lin","Jiaxuan Lu","Yue Yong","Jiahao Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.13829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13826v1","updated":"2025-01-23T16:51:47Z","published":"2025-01-23T16:51:47Z","title":"Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline\n Professional Videos","summary":" Humans acquire knowledge through three cognitive stages: perceiving\ninformation, comprehending knowledge, and adapting knowledge to solve novel\nproblems. Videos serve as an effective medium for this learning process,\nfacilitating a progression through these cognitive stages. However, existing\nvideo benchmarks fail to systematically evaluate the knowledge acquisition\ncapabilities in Large Multimodal Models (LMMs). To address this gap, we\nintroduce Video-MMMU, a multi-modal, multi-disciplinary benchmark designed to\nassess LMMs' ability to acquire and utilize knowledge from videos. Video-MMMU\nfeatures a curated collection of 300 expert-level videos and 900\nhuman-annotated questions across six disciplines, evaluating knowledge\nacquisition through stage-aligned question-answer pairs: Perception,\nComprehension, and Adaptation. A proposed knowledge gain metric,\n{\\Delta}knowledge, quantifies improvement in performance after video viewing.\nEvaluation of LMMs reveals a steep decline in performance as cognitive demands\nincrease and highlights a significant gap between human and model knowledge\nacquisition, underscoring the need for methods to enhance LMMs' capability to\nlearn and adapt from videos.\n","authors":["Kairui Hu","Penghao Wu","Fanyi Pu","Wang Xiao","Yuanhan Zhang","Xiang Yue","Bo Li","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.13826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13818v1","updated":"2025-01-23T16:39:09Z","published":"2025-01-23T16:39:09Z","title":"Ensuring Medical AI Safety: Explainable AI-Driven Detection and\n Mitigation of Spurious Model Behavior and Associated Data","summary":" Deep neural networks are increasingly employed in high-stakes medical\napplications, despite their tendency for shortcut learning in the presence of\nspurious correlations, which can have potentially fatal consequences in\npractice. Detecting and mitigating shortcut behavior is a challenging task that\noften requires significant labeling efforts from domain experts. 
To alleviate\nthis problem, we introduce a semi-automated framework for the identification of\nspurious behavior from both data and model perspective by leveraging insights\nfrom eXplainable Artificial Intelligence (XAI). This allows the retrieval of\nspurious data points and the detection of model circuits that encode the\nassociated prediction rules. Moreover, we demonstrate how these shortcut\nencodings can be used for XAI-based sample- and pixel-level data annotation,\nproviding valuable information for bias mitigation methods to unlearn the\nundesired shortcut behavior. We show the applicability of our framework using\nfour medical datasets across two modalities, featuring controlled and\nreal-world spurious correlations caused by data artifacts. We successfully\nidentify and mitigate these biases in VGG16, ResNet50, and contemporary Vision\nTransformer models, ultimately increasing their robustness and applicability\nfor real-world medical tasks.\n","authors":["Frederik Pahde","Thomas Wiegand","Sebastian Lapuschkin","Wojciech Samek"],"pdf_url":"https://arxiv.org/pdf/2501.13818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13812v1","updated":"2025-01-23T16:35:21Z","published":"2025-01-23T16:35:21Z","title":"By-Example Synthesis of Vector Textures","summary":" We propose a new method for synthesizing an arbitrarily sized novel vector\ntexture given a single raster exemplar. Our method first segments the exemplar\nto extract the primary textons, and then clusters them based on visual\nsimilarity. We then compute a descriptor to capture each texton's neighborhood\nwhich contains the inter-category relationships that are used at synthesis\ntime. Next, we use a simple procedure to both extract and place the secondary\ntextons behind the primary polygons. Finally, our method constructs a gradient\nfield for the background which is defined by a set of data points and colors.\nThe color of the secondary polygons are also adjusted to better match the\ngradient field. To compare our work with other methods, we use a wide range of\nperceptual-based metrics.\n","authors":["Christopher Palazzolo","Oliver van Kaick","David Mould"],"pdf_url":"https://arxiv.org/pdf/2501.13812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12574v4","updated":"2025-01-23T16:31:49Z","published":"2024-08-22T17:41:45Z","title":"MuMA-ToM: Multi-modal Multi-Agent Theory of Mind","summary":" Understanding people's social interactions in complex real-world scenarios\noften relies on intricate mental reasoning. To truly understand how and why\npeople interact with one another, we must infer the underlying mental states\nthat give rise to the social interactions, i.e., Theory of Mind reasoning in\nmulti-agent interactions. Additionally, social interactions are often\nmulti-modal -- we can watch people's actions, hear their conversations, and/or\nread about their past behaviors. For AI systems to successfully and safely\ninteract with people in real-world environments, they also need to understand\npeople's mental states as well as their inferences about each other's mental\nstates based on multi-modal information about their interactions. For this, we\nintroduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark.\nMuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates\nmental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide\nvideo and text descriptions of people's multi-modal behavior in realistic\nhousehold environments. 
Based on the context, we then ask questions about\npeople's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM\nin a human experiment and provided a human baseline. We also proposed a novel\nmulti-modal, multi-agent ToM model, LIMP (Language model-based Inverse\nMulti-agent Planning). Our experimental results show that LIMP significantly\noutperforms state-of-the-art methods, including large multi-modal models (e.g.,\nGPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.\n","authors":["Haojun Shi","Suyu Ye","Xinyu Fang","Chuanyang Jin","Leyla Isik","Yen-Ling Kuo","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2408.12574v4.pdf","comment":"AAAI-25 (Oral). Project website:\n https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code:\n https://github.com/SCAI-JHU/MuMA-ToM"},{"id":"http://arxiv.org/abs/2501.13805v1","updated":"2025-01-23T16:25:08Z","published":"2025-01-23T16:25:08Z","title":"EgoHand: Ego-centric Hand Pose Estimation and Gesture Recognition with\n Head-mounted Millimeter-wave Radar and IMUs","summary":" Recent advanced Virtual Reality (VR) headsets, such as the Apple Vision Pro,\nemploy bottom-facing cameras to detect hand gestures and inputs, which offers\nusers significant convenience in VR interactions. However, these bottom-facing\ncameras can sometimes be inconvenient and pose a risk of unintentionally\nexposing sensitive information, such as private body parts or personal\nsurroundings. To mitigate these issues, we introduce EgoHand. This system\nprovides an alternative solution by integrating millimeter-wave radar and IMUs\nfor hand gesture recognition, thereby offering users an additional option for\ngesture interaction that enhances privacy protection. To accurately recognize\nhand gestures, we devise a two-stage skeleton-based gesture recognition scheme.\nIn the first stage, a novel end-to-end Transformer architecture is employed to\nestimate the coordinates of hand joints. Subsequently, these estimated joint\ncoordinates are utilized for gesture recognition. Extensive experiments\ninvolving 10 subjects show that EgoHand can detect hand gestures with 90.8%\naccuracy. Furthermore, EgoHand demonstrates robust performance across a variety\nof cross-domain tests, including different users, dominant hands, body\npostures, and scenes.\n","authors":["Yizhe Lv","Tingting Zhang","Yunpeng Song","Han Ding","Jinsong Han","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2501.13805v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.00409v2","updated":"2025-01-23T16:23:37Z","published":"2024-03-30T16:35:38Z","title":"3DGSR: Implicit Surface Reconstruction with 3D Gaussian Splatting","summary":" In this paper, we present an implicit surface reconstruction method with 3D\nGaussian Splatting (3DGS), namely 3DGSR, that allows for accurate 3D\nreconstruction with intricate details while inheriting the high efficiency and\nrendering quality of 3DGS. The key insight is incorporating an implicit signed\ndistance field (SDF) within 3D Gaussians to enable them to be aligned and\njointly optimized. First, we introduce a differentiable SDF-to-opacity\ntransformation function that converts SDF values into corresponding Gaussians'\nopacities. This function connects the SDF and 3D Gaussians, allowing for\nunified optimization and enforcing surface constraints on the 3D Gaussians.\nDuring learning, optimizing the 3D Gaussians provides supervisory signals for\nSDF learning, enabling the reconstruction of intricate details. 
However, this\nonly provides sparse supervisory signals to the SDF at locations occupied by\nGaussians, which is insufficient for learning a continuous SDF. Then, to\naddress this limitation, we incorporate volumetric rendering and align the\nrendered geometric attributes (depth, normal) with those derived from 3D\nGaussians. This consistency regularization introduces supervisory signals to\nlocations not covered by discrete 3D Gaussians, effectively eliminating\nredundant surfaces outside the Gaussian sampling range. Our extensive\nexperimental results demonstrate that our 3DGSR method enables high-quality 3D\nsurface reconstruction while preserving the efficiency and rendering quality of\n3DGS. Besides, our method competes favorably with leading surface\nreconstruction techniques while offering a more efficient learning process and\nmuch better rendering qualities. The code will be available at\nhttps://github.com/CVMI-Lab/3DGSR.\n","authors":["Xiaoyang Lyu","Yang-Tian Sun","Yi-Hua Huang","Xiuzhe Wu","Ziyi Yang","Yilun Chen","Jiangmiao Pang","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2404.00409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13796v1","updated":"2025-01-23T16:14:02Z","published":"2025-01-23T16:14:02Z","title":"PromptMono: Cross Prompting Attention for Self-Supervised Monocular\n Depth Estimation in Challenging Environments","summary":" Considerable efforts have been made to improve monocular depth estimation\nunder ideal conditions. However, in challenging environments, monocular depth\nestimation still faces difficulties. In this paper, we introduce visual prompt\nlearning for predicting depth across different environments within a unified\nmodel, and present a self-supervised learning framework called PromptMono. It\nemploys a set of learnable parameters as visual prompts to capture\ndomain-specific knowledge. To integrate prompting information into image\nrepresentations, a novel gated cross prompting attention (GCPA) module is\nproposed, which enhances the depth estimation in diverse conditions. We\nevaluate the proposed PromptMono on the Oxford Robotcar dataset and the\nnuScenes dataset. Experimental results demonstrate the superior performance of\nthe proposed method.\n","authors":["Changhao Wang","Guanwen Zhang","Zhengyun Cheng","Wei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.13796v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2501.13795v1","updated":"2025-01-23T16:13:58Z","published":"2025-01-23T16:13:58Z","title":"Training-Free Zero-Shot Temporal Action Detection with Vision-Language\n Models","summary":" Existing zero-shot temporal action detection (ZSTAD) methods predominantly\nuse fully supervised or unsupervised strategies to recognize unseen activities.\nHowever, these training-based methods are prone to domain shifts and require\nhigh computational costs, which hinder their practical applicability in\nreal-world scenarios. In this paper, unlike previous works, we propose a\ntraining-Free Zero-shot temporal Action Detection (FreeZAD) method, leveraging\nexisting vision-language (ViL) models to directly classify and localize unseen\nactivities within untrimmed videos without any additional fine-tuning or\nadaptation. We mitigate the need for explicit temporal modeling and reliance on\npseudo-label quality by designing the LOGarithmic decay weighted\nOuter-Inner-Contrastive Score (LogOIC) and frequency-based Actionness\nCalibration. 
Furthermore, we introduce a test-time adaptation (TTA) strategy\nusing Prototype-Centric Sampling (PCS) to expand FreeZAD, enabling ViL models\nto adapt more effectively for ZSTAD. Extensive experiments on the THUMOS14 and\nActivityNet-1.3 datasets demonstrate that our training-free method outperforms\nstate-of-the-art unsupervised methods while requiring only 1/13 of the runtime.\nWhen equipped with TTA, the enhanced method further narrows the gap with fully\nsupervised methods.\n","authors":["Chaolei Han","Hongsong Wang","Jidong Kuang","Lei Zhang","Jie Gui"],"pdf_url":"https://arxiv.org/pdf/2501.13795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10600v3","updated":"2025-01-23T16:02:29Z","published":"2023-12-17T04:26:42Z","title":"How to Efficiently Annotate Images for Best-Performing Deep Learning\n Based Segmentation Models: An Empirical Study with Weak and Noisy Annotations\n and Segment Anything Model","summary":" Deep neural networks (DNNs) have demonstrated exceptional performance across\nvarious image segmentation tasks. However, the process of preparing datasets\nfor training segmentation DNNs is both labor-intensive and costly, as it\ntypically requires pixel-level annotations for each object of interest. To\nmitigate this challenge, alternative approaches such as using weak labels\n(e.g., bounding boxes or scribbles) or less precise (noisy) annotations can be\nemployed. Noisy and weak labels are significantly quicker to generate, allowing\nfor more annotated images within the same time frame. However, the potential\ndecrease in annotation quality may adversely impact the segmentation\nperformance of the resulting model. In this study, we conducted a comprehensive\ncost-effectiveness evaluation on six variants of annotation strategies (9~10\nsub-variants in total) across 4 datasets and conclude that the common practice\nof precisely outlining objects of interest is virtually never the optimal\napproach when annotation budget is limited. Both noisy and weak annotations\nshowed usage cases that yield similar performance to the perfectly annotated\ncounterpart, yet had significantly better cost-effectiveness. We hope our\nfindings will help researchers be aware of the different available options and\nuse their annotation budgets more efficiently, especially in cases where\naccurately acquiring labels for target objects is particularly costly. Our code\nwill be made available on https://github.com/yzluka/AnnotationEfficiency2D.\n","authors":["Yixin Zhang","Shen Zhao","Hanxue Gu","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2312.10600v3.pdf","comment":"Supplemental information is in appendix"},{"id":"http://arxiv.org/abs/2407.05765v2","updated":"2025-01-23T15:42:16Z","published":"2024-07-08T09:16:42Z","title":"Invariance Principle Meets Vicinal Risk Minimization","summary":" Deep learning models excel in computer vision tasks but often fail to\ngeneralize to out-of-distribution (OOD) domains. Invariant Risk Minimization\n(IRM) aims to address OOD generalization by learning domain-invariant features.\nHowever, IRM struggles with datasets exhibiting significant diversity shifts.\nWhile data augmentation methods like Mixup and Semantic Data Augmentation (SDA)\nenhance diversity, they risk over-augmentation and label instability. To\naddress these challenges, we propose a domain-shared Semantic Data Augmentation\n(SDA) module, a novel implementation of Variance Risk Minimization (VRM)\ndesigned to enhance dataset diversity while maintaining label consistency. 
We\nfurther provide a Rademacher complexity analysis, establishing a tighter\ngeneralization error bound compared to baseline methods. Extensive evaluations\non OOD benchmarks, including PACS, VLCS, OfficeHome, and TerraIncognita,\ndemonstrate consistent performance improvements over state-of-the-art domain\ngeneralization methods.\n","authors":["Yaoyao Zhu","Xiuding Cai","Yingkai Wang","Dong Miao","Zhongliang Fu","Xu Luo"],"pdf_url":"https://arxiv.org/pdf/2407.05765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13756v1","updated":"2025-01-23T15:35:15Z","published":"2025-01-23T15:35:15Z","title":"Solving the long-tailed distribution problem by exploiting the synergies\n and balance of different techniques","summary":" In real-world data, long-tailed data distribution is common, making it\nchallenging for models trained on empirical risk minimisation to learn and\nclassify tail classes effectively. While many studies have sought to improve\nlong tail recognition by altering the data distribution in the feature space\nand adjusting model decision boundaries, research on the synergy and corrective\napproach among various methods is limited. Our study delves into three\nlong-tail recognition techniques: Supervised Contrastive Learning (SCL),\nRare-Class Sample Generator (RSG), and Label-Distribution-Aware Margin Loss\n(LDAM). SCL enhances intra-class clusters based on feature similarity and\npromotes clear inter-class separability but tends to favour dominant classes\nonly. When RSG is integrated into the model, we observed that the intra-class\nfeatures further cluster towards the class centre, which demonstrates a\nsynergistic effect together with SCL's principle of enhancing intra-class\nclustering. RSG generates new tail features and compensates for the tail\nfeature space squeezed by SCL. Similarly, LDAM is known to introduce a larger\nmargin specifically for tail classes; we demonstrate that LDAM further bolsters\nthe model's performance on tail classes when combined with the more explicit\ndecision boundaries achieved by SCL and RSG. Furthermore, SCL can compensate\nfor the dominant class accuracy sacrificed by RSG and LDAM. Our research\nemphasises the synergy and balance among the three techniques, with each\namplifying the strengths of the others and mitigating their shortcomings. Our\nexperiment on long-tailed distribution datasets, using an end-to-end\narchitecture, yields competitive results by enhancing tail class accuracy\nwithout compromising dominant class performance, achieving a balanced\nimprovement across all classes.\n","authors":["Ziheng Wang","Toni Lassila","Sharib Ali"],"pdf_url":"https://arxiv.org/pdf/2501.13756v1.pdf","comment":"13"},{"id":"http://arxiv.org/abs/2501.13751v1","updated":"2025-01-23T15:32:06Z","published":"2025-01-23T15:32:06Z","title":"On Disentangled Training for Nonlinear Transform in Learned Image\n Compression","summary":" Learned image compression (LIC) has demonstrated superior rate-distortion\n(R-D) performance compared to traditional codecs, but is challenged by training\ninefficiency that could incur more than two weeks to train a state-of-the-art\nmodel from scratch. Existing LIC methods overlook the slow convergence caused\nby compacting energy in learning nonlinear transforms. In this paper, we first\nreveal that such energy compaction consists of two components, i.e., feature\ndecorrelation and uneven energy modulation. 
On this basis, we propose a linear\nauxiliary transform (AuxT) to disentangle energy compaction in training\nnonlinear transforms. The proposed AuxT obtains a coarse approximation to achieve\nefficient energy compaction such that distribution fitting with the nonlinear\ntransforms can be simplified to fine details. We then develop wavelet-based\nlinear shortcuts (WLSs) for AuxT that leverage wavelet-based downsampling and\northogonal linear projection for feature decorrelation and subband-aware\nscaling for uneven energy modulation. AuxT is lightweight and plug-and-play to\nbe integrated into diverse LIC models to address the slow convergence issue.\nExperimental results demonstrate that the proposed approach can accelerate\ntraining of LIC models by 2 times and simultaneously achieves an average 1\\%\nBD-rate reduction. To the best of our knowledge, this is one of the first successful\nattempts to significantly improve the convergence of LIC with comparable\nor superior rate-distortion performance. Code will be released at\n\\url{https://github.com/qingshi9974/AuxT}\n","authors":["Han Li","Shaohui Li","Wenrui Dai","Maida Cao","Nuowen Kan","Chenglin Li","Junni Zou","Hongkai Xiong"],"pdf_url":"https://arxiv.org/pdf/2501.13751v1.pdf","comment":"Accepted by ICLR2025"},{"id":"http://arxiv.org/abs/2412.02856v2","updated":"2025-01-23T15:09:35Z","published":"2024-12-03T21:43:11Z","title":"Is Large-Scale Pretraining the Secret to Good Domain Generalization?","summary":" Multi-Source Domain Generalization (DG) is the task of training on multiple\nsource domains and achieving high classification performance on unseen target\ndomains. Recent methods combine robust features from web-scale pretrained\nbackbones with new features learned from source data, and this has dramatically\nimproved benchmark results. However, it remains unclear if DG finetuning\nmethods are becoming better over time, or if improved benchmark performance is\nsimply an artifact of stronger pre-training. Prior studies have shown that\nperceptual similarity to pre-training data correlates with zero-shot\nperformance, but we find the effect limited in the DG setting. Instead, we\nposit that having perceptually similar data in pretraining is not enough, and\nthat it is how well these data were learned that determines performance. This\nleads us to introduce the Alignment Hypothesis, which states that the final DG\nperformance will be high if and only if alignment of image and class label text\nembeddings is high. Our experiments confirm the Alignment Hypothesis is true,\nand we use it as an analysis tool of existing DG methods evaluated on DomainBed\ndatasets by splitting evaluation data into In-pretraining (IP) and\nOut-of-pretraining (OOP). We show that all evaluated DG methods struggle on\nDomainBed-OOP, while recent methods excel on DomainBed-IP. Put together, our\nfindings highlight the need for DG methods which can generalize beyond\npretraining alignment.\n","authors":["Piotr Teterwak","Kuniaki Saito","Theodoros Tsiligkaridis","Bryan A. 
Plummer","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2412.02856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13725v1","updated":"2025-01-23T14:58:49Z","published":"2025-01-23T14:58:49Z","title":"You Only Crash Once v2: Perceptually Consistent Strong Features for\n One-Stage Domain Adaptive Detection of Space Terrain","summary":" The in-situ detection of planetary, lunar, and small-body surface terrain is\ncrucial for autonomous spacecraft applications, where learning-based computer\nvision methods are increasingly employed to enable intelligence without prior\ninformation or human intervention. However, many of these methods remain\ncomputationally expensive for spacecraft processors and prevent real-time\noperation. Training of such algorithms is additionally complex due to the\nscarcity of labeled data and reliance on supervised learning approaches.\nUnsupervised Domain Adaptation (UDA) offers a promising solution by\nfacilitating model training with disparate data sources such as simulations or\nsynthetic scenes, although UDA is difficult to apply to celestial environments\nwhere challenging feature spaces are paramount. To alleviate such issues, You\nOnly Crash Once (YOCOv1) has studied the integration of Visual Similarity-based\nAlignment (VSA) into lightweight one-stage object detection architectures to\nimprove space terrain UDA. Although proven effective, the approach faces\nnotable limitations, including performance degradations in multi-class and\nhigh-altitude scenarios. Building upon the foundation of YOCOv1, we propose\nnovel additions to the VSA scheme that enhance terrain detection capabilities\nunder UDA, and our approach is evaluated across both simulated and real-world\ndata. Our second YOCO rendition, YOCOv2, is capable of achieving\nstate-of-the-art UDA performance on surface terrain detection, where we\nshowcase improvements upwards of 31% compared with YOCOv1 and terrestrial\nstate-of-the-art. We demonstrate the practical utility of YOCOv2 with\nspacecraft flight hardware performance benchmarking and qualitative evaluation\nof NASA mission data.\n","authors":["Timothy Chase Jr","Christopher Wilson","Karthik Dantu"],"pdf_url":"https://arxiv.org/pdf/2501.13725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11327v2","updated":"2025-01-23T14:50:47Z","published":"2024-06-17T08:39:16Z","title":"ClawMachine: Learning to Fetch Visual Tokens for Referential\n Comprehension","summary":" Aligning vision and language concepts at a finer level remains an essential\ntopic of multimodal large language models (MLLMs), particularly for tasks such\nas referring and grounding. Existing methods, such as proxy encoding and\ngeometry encoding, incorporate additional syntax to encode spatial information,\nimposing extra burdens when communicating between language and vision modules.\nIn this study, we propose ClawMachine, offering a new methodology that\nexplicitly notates each entity using token collectives groups of visual tokens\nthat collaboratively represent higher level semantics. A hybrid perception\nmechanism is also explored to perceive and understand scenes from both discrete\nand continuous spaces. Our method unifies the prompt and answer of visual\nreferential tasks without using additional syntax. By leveraging a joint\nvision-language vocabulary, ClawMachine further integrates referring and\ngrounding in an auto-regressive manner, demonstrating great potential with\nscaled-up pre-training data. 
Experiments show that ClawMachine achieves\nsuperior performance on scene-level and referential understanding tasks with\nhigher efficiency. It also exhibits the potential to integrate multi-source\ninformation for complex visual reasoning, which is beyond the capability of\nmany MLLMs. Our code is available at github.com/martian422/ClawMachine.\n","authors":["Tianren Ma","Lingxi Xie","Yunjie Tian","Boyu Yang","Qixiang Ye"],"pdf_url":"https://arxiv.org/pdf/2406.11327v2.pdf","comment":"ICLR 2025. Code is available at github.com/martian422/ClawMachine"},{"id":"http://arxiv.org/abs/2407.02272v2","updated":"2025-01-23T14:50:20Z","published":"2024-07-02T14:01:59Z","title":"Aligning Human Motion Generation with Human Perceptions","summary":" Human motion generation is a critical task with a wide range of applications.\nAchieving high realism in generated motions requires naturalness, smoothness,\nand plausibility. Despite rapid advancements in the field, current generation\nmethods often fall short of these goals. Furthermore, existing evaluation\nmetrics typically rely on ground-truth-based errors, simple heuristics, or\ndistribution distances, which do not align well with human perceptions of\nmotion quality. In this work, we propose a data-driven approach to bridge this\ngap by introducing a large-scale human perceptual evaluation dataset,\nMotionPercept, and a human motion critic model, MotionCritic, that capture\nhuman perceptual preferences. Our critic model offers a more accurate metric\nfor assessing motion quality and could be readily integrated into the motion\ngeneration pipeline to enhance generation quality. Extensive experiments\ndemonstrate the effectiveness of our approach in both evaluating and improving\nthe quality of generated human motions by aligning with human perceptions. Code\nand data are publicly available at https://motioncritic.github.io/.\n","authors":["Haoru Wang","Wentao Zhu","Luyi Miao","Yishu Xu","Feng Gao","Qi Tian","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02272v2.pdf","comment":"Project page: https://motioncritic.github.io/"},{"id":"http://arxiv.org/abs/2501.13718v1","updated":"2025-01-23T14:46:38Z","published":"2025-01-23T14:46:38Z","title":"A Mutual Information Perspective on Multiple Latent Variable Generative\n Models for Positive View Generation","summary":" In image generation, Multiple Latent Variable Generative Models (MLVGMs)\nemploy multiple latent variables to gradually shape the final images, from\nglobal characteristics to finer and local details (e.g., StyleGAN, NVAE),\nemerging as powerful tools for diverse applications. Yet their generative\ndynamics and latent variable utilization remain only empirically observed. In\nthis work, we propose a novel framework to systematically quantify the impact\nof each latent variable in MLVGMs, using Mutual Information (MI) as a guiding\nmetric. Our analysis reveals underutilized variables and can guide the use of\nMLVGMs in downstream applications.\n With this foundation, we introduce a method for generating synthetic data for\nSelf-Supervised Contrastive Representation Learning (SSCRL). By leveraging the\nhierarchical and disentangled variables of MLVGMs, and guided by the previous\nanalysis, we apply tailored latent perturbations to produce diverse views for\nSSCRL, without relying on real data altogether.\n Additionally, we introduce a Continuous Sampling (CS) strategy, where the\ngenerator dynamically creates new samples during SSCRL training, greatly\nincreasing data variability. 
Our comprehensive experiments demonstrate the\neffectiveness of these contributions, showing that MLVGMs' generated views\ncompete on par with or even surpass views generated from real data.\n This work establishes a principled approach to understanding and exploiting\nMLVGMs, advancing both generative modeling and self-supervised learning.\n","authors":["Dario Serez","Marco Cristani","Alessio Del Bue","Vittorio Murino","Pietro Morerio"],"pdf_url":"https://arxiv.org/pdf/2501.13718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13713v1","updated":"2025-01-23T14:43:53Z","published":"2025-01-23T14:43:53Z","title":"Skin Disease Detection and Classification of Actinic Keratosis and\n Psoriasis Utilizing Deep Transfer Learning","summary":" Skin diseases can arise from infections, allergies, genetic factors,\nautoimmune disorders, hormonal imbalances, or environmental triggers such as\nsun damage and pollution. Some skin diseases, such as Actinic Keratosis and\nPsoriasis, can be fatal if not treated in time. Early identification is\ncrucial, but the diagnostic methods for these conditions are often expensive\nand not widely accessible. In this study, we propose a novel and efficient\nmethod for diagnosing skin diseases using deep learning techniques. This\napproach employs a modified VGG16 Convolutional Neural Network (CNN) model. The\nmodel includes several convolutional layers and utilizes ImageNet weights with\nmodified top layers. The top layer is updated with fully connected layers and a\nfinal softmax activation layer to classify skin diseases. The dataset used,\ntitled \"Skin Disease Dataset,\" is publicly available. While the VGG16\narchitecture does not include data augmentation by default, preprocessing\ntechniques such as rotation, shifting, and zooming were applied to augment the\ndata prior to model training. The proposed methodology achieved 90.67% accuracy\nusing the modified VGG16 model, demonstrating its reliability in classifying\nskin diseases. The promising results highlight the potential of this approach\nfor real-world applications.\n","authors":["Fahud Ahmmed","Md. Zaheer Raihan","Kamnur Nahar","D. M. Asadujjaman","Md. Mahfujur Rahman","Abdullah Tamim"],"pdf_url":"https://arxiv.org/pdf/2501.13713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13106v2","updated":"2025-01-23T14:41:06Z","published":"2025-01-22T18:59:46Z","title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video\n Understanding","summary":" In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation\nmodel for image and video understanding. The core design philosophy of\nVideoLLaMA3 is vision-centric. The meaning of \"vision-centric\" is two-fold: the\nvision-centric training paradigm and vision-centric framework design. The key\ninsight of our vision-centric training paradigm is that high-quality image-text\ndata is crucial for both image and video understanding. Instead of preparing\nmassive video-text datasets, we focus on constructing large-scale and\nhigh-quality image-text datasets. VideoLLaMA3 has four training stages: 1)\nVision Encoder Adaptation, which enables vision encoder to accept images of\nvariable resolutions as input; 2) Vision-Language Alignment, which jointly\ntunes the vision encoder, projector, and LLM with large-scale image-text data\ncovering multiple types (including scene images, documents, charts) as well as\ntext-only data. 
3) Multi-task Fine-tuning, which incorporates image-text SFT\ndata for downstream tasks and video-text data to establish a foundation for\nvideo understanding. 4) Video-centric Fine-tuning, which further improves the\nmodel's capability in video understanding. As for the framework design, to\nbetter capture fine-grained details in images, the pretrained vision encoder is\nadapted to encode images of varying sizes into vision tokens with corresponding\nnumbers, rather than a fixed number of tokens. For video inputs, we reduce the\nnumber of vision tokens according to their similarity so that the\nrepresentation of videos will be more precise and compact. Benefit from\nvision-centric designs, VideoLLaMA3 achieves compelling performances in both\nimage and video understanding benchmarks.\n","authors":["Boqiang Zhang","Kehan Li","Zesen Cheng","Zhiqiang Hu","Yuqian Yuan","Guanzheng Chen","Sicong Leng","Yuming Jiang","Hang Zhang","Xin Li","Peng Jin","Wenqi Zhang","Fan Wang","Lidong Bing","Deli Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.13106v2.pdf","comment":"BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to\n this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3"},{"id":"http://arxiv.org/abs/2501.13710v1","updated":"2025-01-23T14:38:40Z","published":"2025-01-23T14:38:40Z","title":"YOLO11-JDE: Fast and Accurate Multi-Object Tracking with Self-Supervised\n Re-ID","summary":" We introduce YOLO11-JDE, a fast and accurate multi-object tracking (MOT)\nsolution that combines real-time object detection with self-supervised\nRe-Identification (Re-ID). By incorporating a dedicated Re-ID branch into\nYOLO11s, our model performs Joint Detection and Embedding (JDE), generating\nappearance features for each detection. The Re-ID branch is trained in a fully\nself-supervised setting while simultaneously training for detection,\neliminating the need for costly identity-labeled datasets. The triplet loss,\nwith hard positive and semi-hard negative mining strategies, is used for\nlearning discriminative embeddings. Data association is enhanced with a custom\ntracking implementation that successfully integrates motion, appearance, and\nlocation cues. YOLO11-JDE achieves competitive results on MOT17 and MOT20\nbenchmarks, surpassing existing JDE methods in terms of FPS and using up to ten\ntimes fewer parameters. Thus, making our method a highly attractive solution\nfor real-world applications.\n","authors":["Iñaki Erregue","Kamal Nasrollahi","Sergio Escalera"],"pdf_url":"https://arxiv.org/pdf/2501.13710v1.pdf","comment":"This paper has been accepted to the 5th Workshop on Real-World\n Surveillance: Applications and Challenges (WACV 2025)"},{"id":"http://arxiv.org/abs/2501.13709v1","updated":"2025-01-23T14:38:05Z","published":"2025-01-23T14:38:05Z","title":"Regularizing cross entropy loss via minimum entropy and K-L divergence","summary":" I introduce two novel loss functions for classification in deep learning. The\ntwo loss functions extend standard cross entropy loss by regularizing it with\nminimum entropy and Kullback-Leibler (K-L) divergence terms. The first of the\ntwo novel loss functions is termed mixed entropy loss (MIX-ENT for short),\nwhile the second one is termed minimum entropy regularized cross-entropy loss\n(MIN-ENT for short). The MIX-ENT function introduces a regularizer that can be\nshown to be equivalent to the sum of a minimum entropy term and a K-L\ndivergence term. 
However, it should be noted that the K-L divergence term here\nis different from that in the standard cross-entropy loss function, in the\nsense that it swaps the roles of the target probability and the hypothesis\nprobability. The MIN-ENT function simply adds a minimum entropy regularizer to\nthe standard cross entropy loss function. In both MIX-ENT and MIN-ENT, the\nminimum entropy regularizer minimizes the entropy of the hypothesis probability\ndistribution which is output by the neural network. Experiments on the\nEMNIST-Letters dataset show that my implementation of MIX-ENT and MIN-ENT lets\nthe VGG model climb from its previous 3rd position on the paperswithcode\nleaderboard to reach the 2nd position on the leaderboard, outperforming the\nSpinal-VGG model in so doing. Specifically, using standard cross-entropy, VGG\nachieves 95.86% while Spinal-VGG achieves 95.88% classification accuracies,\nwhereas using VGG (without Spinal-VGG) our MIN-ENT achieved 95.933%, while our\nMIX-ENT achieved 95.927% accuracies. The pre-trained models for both MIX-ENT\nand MIN-ENT are at https://github.com/rahmanoladi/minimum entropy project.\n","authors":["Abdulrahman Oladipupo Ibraheem"],"pdf_url":"https://arxiv.org/pdf/2501.13709v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2501.13707v1","updated":"2025-01-23T14:37:21Z","published":"2025-01-23T14:37:21Z","title":"EventVL: Understand Event Streams via Multimodal Large Language Model","summary":" The event-based Vision-Language Model (VLM) has recently made good progress\non practical vision tasks. However, most of these works just utilize CLIP and\nfocus on traditional perception tasks, which prevents the model from explicitly\nunderstanding the full semantics and context of event streams. To address\nthe deficiency, we propose EventVL, the first generative event-based MLLM\n(Multimodal Large Language Model) framework for explicit semantic\nunderstanding. Specifically, to bridge the data gap for connecting different\nmodalities' semantics, we first annotate a large event-image/video-text dataset,\ncontaining almost 1.4 million high-quality pairs of data, which enables\neffective learning across various scenes, e.g., driving scenes or human motion.\nAfter that, we design Event Spatiotemporal Representation to fully explore the\ncomprehensive information by diversely aggregating and segmenting the event\nstream. To further promote a compact semantic space, Dynamic Semantic Alignment\nis introduced to improve and complete sparse semantic spaces of events.\nExtensive experiments show that our EventVL can significantly surpass existing\nMLLM baselines in event captioning and scene description generation tasks. We\nhope our research could contribute to the development of the event vision\ncommunity.\n","authors":["Pengteng Li","Yunfan Lu","Pinghao Song","Wuyang Li","Huizai Yao","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2501.13707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09046v2","updated":"2025-01-23T14:18:07Z","published":"2025-01-15T09:52:40Z","title":"Learning Hemodynamic Scalar Fields on Coronary Artery Meshes: A\n Benchmark of Geometric Deep Learning Models","summary":" Coronary artery disease, caused by the narrowing of coronary vessels due to\natherosclerosis, is the leading cause of death worldwide. The diagnostic gold\nstandard, fractional flow reserve (FFR), measures the trans-stenotic pressure\nratio during maximal vasodilation but is invasive and costly. 
This has driven\nthe development of virtual FFR (vFFR) using computational fluid dynamics (CFD)\nto simulate coronary flow. Geometric deep learning algorithms have shown\npromise for learning features on meshes, including cardiovascular research\napplications. This study empirically analyzes various backends for predicting\nvFFR fields in coronary arteries as CFD surrogates, comparing six backends for\nlearning hemodynamics on meshes using CFD solutions as ground truth.\n The study has two parts: i) Using 1,500 synthetic left coronary artery\nbifurcations, models were trained to predict pressure-related fields for vFFR\nreconstruction, comparing different learning variables. ii) Using 427\npatient-specific CFD simulations, experiments were repeated focusing on the\nbest-performing learning variable from the synthetic dataset.\n Most backends performed well on the synthetic dataset, especially when\npredicting pressure drop over the manifold. Transformer-based backends\noutperformed others when predicting pressure and vFFR fields and were the only\nmodels achieving strong performance on patient-specific data, excelling in both\naverage per-point error and vFFR accuracy in stenotic lesions.\n These results suggest geometric deep learning backends can effectively\nreplace CFD for simple geometries, while transformer-based networks are\nsuperior for complex, heterogeneous datasets. Pressure drop was identified as\nthe optimal network output for learning pressure-related fields.\n","authors":["Guido Nannini","Julian Suk","Patryk Rygiel","Simone Saitta","Luca Mariani","Riccardo Maragna","Andrea Baggiano","Gianluca Pontone","Jelmer M. Wolterink","Alberto Redaelli"],"pdf_url":"https://arxiv.org/pdf/2501.09046v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13692v1","updated":"2025-01-23T14:17:01Z","published":"2025-01-23T14:17:01Z","title":"Training-Free Consistency Pipeline for Fashion Repose","summary":" Recent advancements in diffusion models have significantly broadened the\npossibilities for editing images of real-world objects. However, performing\nnon-rigid transformations, such as changing the pose of objects or image-based\nconditioning, remains challenging. Maintaining object identity during these\nedits is difficult, and current methods often fall short of the precision\nneeded for industrial applications, where consistency is critical.\nAdditionally, fine-tuning diffusion models requires custom training data, which\nis not always accessible in real-world scenarios. This work introduces\nFashionRepose, a training-free pipeline for non-rigid pose editing specifically\ndesigned for the fashion industry. The approach integrates off-the-shelf models\nto adjust poses of long-sleeve garments, maintaining identity and branding\nattributes. FashionRepose uses a zero-shot approach to perform these edits in\nnear real-time, eliminating the need for specialized training and enabling consistent image\nediting. 
The solution holds potential for applications in the fashion industry\nand other fields demanding identity preservation in image editing.\n","authors":["Potito Aghilar","Vito Walter Anelli","Michelantonio Trizio","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2501.13692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13690v1","updated":"2025-01-23T14:15:54Z","published":"2025-01-23T14:15:54Z","title":"Variational U-Net with Local Alignment for Joint Tumor Extraction and\n Registration (VALOR-Net) of Breast MRI Data Acquired at Two Different Field\n Strengths","summary":" Background: Multiparametric breast MRI data might improve tumor diagnostics,\ncharacterization, and treatment planning. Accurate alignment and delineation of\nimages acquired at different field strengths such as 3T and 7T, remain\nchallenging research tasks. Purpose: To address alignment challenges and enable\nconsistent tumor segmentation across different MRI field strengths. Study type:\nRetrospective. Subjects: Nine female subjects with breast tumors were involved:\nsix histologically proven invasive ductal carcinomas (IDC) and three\nfibroadenomas. Field strength/sequence: Imaging was performed at 3T and 7T\nscanners using post-contrast T1-weighted three-dimensional time-resolved\nangiography with stochastic trajectories (TWIST) sequence. Assessments: The\nmethod's performance for joint image registration and tumor segmentation was\nevaluated using several quantitative metrics, including signal-to-noise ratio\n(PSNR), structural similarity index (SSIM), normalized cross-correlation (NCC),\nDice coefficient, F1 score, and relative sum of squared differences (rel SSD).\nStatistical tests: The Pearson correlation coefficient was used to test the\nrelationship between the registration and segmentation metrics. Results: When\ncalculated for each subject individually, the PSNR was in a range from 27.5 to\n34.5 dB, and the SSIM was from 82.6 to 92.8%. The model achieved an NCC from\n96.4 to 99.3% and a Dice coefficient of 62.9 to 95.3%. The F1 score was between\n55.4 and 93.2% and the rel SSD was in the range of 2.0 and 7.5%. The\nsegmentation metrics Dice and F1 Score are highly correlated (0.995), while a\nmoderate correlation between NCC and SSIM (0.681) was found for registration.\nData conclusion: Initial results demonstrate that the proposed method may be\nfeasible in providing joint tumor segmentation and registration of MRI data\nacquired at different field strengths.\n","authors":["Muhammad Shahkar Khan","Haider Ali","Laura Villazan Garcia","Noor Badshah","Siegfried Trattnig","Florian Schwarzhans","Ramona Woitek","Olgica Zaric"],"pdf_url":"https://arxiv.org/pdf/2501.13690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09720v2","updated":"2025-01-23T14:11:52Z","published":"2025-01-16T18:09:22Z","title":"A Simple Aerial Detection Baseline of Multimodal Language Models","summary":" The multimodal language models (MLMs) based on generative pre-trained\nTransformer are considered powerful candidates for unifying various domains and\ntasks. MLMs developed for remote sensing (RS) have demonstrated outstanding\nperformance in multiple tasks, such as visual question answering and visual\ngrounding. In addition to visual grounding that detects specific objects\ncorresponded to given instruction, aerial detection, which detects all objects\nof multiple categories, is also a valuable and challenging task for RS\nfoundation models. 
However, aerial detection has not been explored by existing\nRS MLMs because the autoregressive prediction mechanism of MLMs differs\nsignificantly from the detection outputs. In this paper, we present a simple\nbaseline for applying MLMs to aerial detection for the first time, named\nLMMRotate. Specifically, we first introduce a normalization method to transform\ndetection outputs into textual outputs to be compatible with the MLM framework.\nThen, we propose a evaluation method, which ensures a fair comparison between\nMLMs and conventional object detection models. We construct the baseline by\nfine-tuning open-source general-purpose MLMs and achieve impressive detection\nperformance comparable to conventional detector. We hope that this baseline\nwill serve as a reference for future MLM development, enabling more\ncomprehensive capabilities for understanding RS images. Code is available at\nhttps://github.com/Li-Qingyun/mllm-mmrotate.\n","authors":["Qingyun Li","Yushi Chen","Xinya Shu","Dong Chen","Xin He","Yi Yu","Xue Yang"],"pdf_url":"https://arxiv.org/pdf/2501.09720v2.pdf","comment":"4 pages, 1 table, 4 figures"},{"id":"http://arxiv.org/abs/2501.13667v1","updated":"2025-01-23T13:53:33Z","published":"2025-01-23T13:53:33Z","title":"MPG-SAM 2: Adapting SAM 2 with Mask Priors and Global Context for\n Referring Video Object Segmentation","summary":" Referring video object segmentation (RVOS) aims to segment objects in a video\naccording to textual descriptions, which requires the integration of multimodal\ninformation and temporal dynamics perception. The Segment Anything Model 2 (SAM\n2) has shown great effectiveness across various video segmentation tasks.\nHowever, its application to offline RVOS is challenged by the translation of\nthe text into effective prompts and a lack of global context awareness. In this\npaper, we propose a novel RVOS framework, termed MPG-SAM 2, to address these\nchallenges. Specifically, MPG-SAM 2 employs a unified multimodal encoder to\njointly encode video and textual features, generating semantically aligned\nvideo and text embeddings, along with multimodal class tokens. A mask prior\ngenerator utilizes the video embeddings and class tokens to create pseudo masks\nof target objects and global context. These masks are fed into the prompt\nencoder as dense prompts along with multimodal class tokens as sparse prompts\nto generate accurate prompts for SAM 2. To provide the online SAM 2 with a\nglobal view, we introduce a hierarchical global-historical aggregator, which\nallows SAM 2 to aggregate global and historical information of target objects\nat both pixel and object levels, enhancing the target representation and\ntemporal consistency. Extensive experiments on several RVOS benchmarks\ndemonstrate the superiority of MPG-SAM 2 and the effectiveness of our proposed\nmodules.\n","authors":["Fu Rong","Meng Lan","Qian Zhang","Lefei Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.13667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.11007v2","updated":"2025-01-23T13:48:05Z","published":"2025-01-19T10:47:49Z","title":"HFGCN:Hypergraph Fusion Graph Convolutional Networks for Skeleton-Based\n Action Recognition","summary":" In recent years, action recognition has received much attention and wide\napplication due to its important role in video understanding. Most of the\nresearches on action recognition methods focused on improving the performance\nvia various deep learning methods rather than the classification of skeleton\npoints. 
The topological modeling between skeleton points and body parts was\nseldom considered. Although some studies have used a data-driven approach to\nclassify the topology of the skeleton point, the nature of the skeleton point\nin terms of kinematics has not been taken into consideration. Therefore, in\nthis paper, we draw on the theory of kinematics to adapt the topological\nrelations of the skeleton point and propose a topological relation\nclassification based on body parts and distance from core of body. To\nsynthesize these topological relations for action recognition, we propose a\nnovel Hypergraph Fusion Graph Convolutional Network (HFGCN). In particular, the\nproposed model is able to focus on the human skeleton points and the different\nbody parts simultaneously, and thus construct the topology, which improves the\nrecognition accuracy obviously. We use a hypergraph to represent the\ncategorical relationships of these skeleton points and incorporate the\nhypergraph into a graph convolution network to model the higher-order\nrelationships among the skeleton points and enhance the feature representation\nof the network. In addition, our proposed hypergraph attention module and\nhypergraph graph convolution module optimize topology modeling in temporal and\nchannel dimensions, respectively, to further enhance the feature representation\nof the network. We conducted extensive experiments on three widely used\ndatasets.The results validate that our proposed method can achieve the best\nperformance when compared with the state-of-the-art skeleton-based methods.\n","authors":["Pengcheng Dong","Wenbo Wan","Huaxiang Zhang","Jiande Sun"],"pdf_url":"https://arxiv.org/pdf/2501.11007v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13643v1","updated":"2025-01-23T13:21:14Z","published":"2025-01-23T13:21:14Z","title":"Enhancing Medical Image Analysis through Geometric and Photometric\n transformations","summary":" Medical image analysis suffers from a lack of labeled data due to several\nchallenges including patient privacy and lack of experts. Although some AI\nmodels only perform well with large amounts of data, we will move to data\naugmentation where there is a solution to improve the performance of our models\nand increase the dataset size through traditional or advanced techniques. In\nthis paper, we evaluate the effectiveness of data augmentation techniques on\ntwo different medical image datasets. In the first step, we applied some\ntransformation techniques to the skin cancer dataset containing benign and\nmalignant classes. Then, we trained the convolutional neural network (CNN) on\nthe dataset before and after augmentation, which significantly improved test\naccuracy from 90.74% to 96.88% and decreased test loss from 0.7921 to 0.1468\nafter augmentation. In the second step, we used the Mixup technique by mixing\ntwo random images and their corresponding masks using the retina and blood\nvessels dataset, then we trained the U-net model and obtained the Dice\ncoefficient which increased from 0 before augmentation to 0.4163 after\naugmentation. 
The result shows the effect of using data augmentation to\nincrease the dataset size on the classification and segmentation performance.\n","authors":["Khadija Rais","Mohamed Amroune","Mohamed Yassine Haouam"],"pdf_url":"https://arxiv.org/pdf/2501.13643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13624v1","updated":"2025-01-23T12:45:20Z","published":"2025-01-23T12:45:20Z","title":"QMamba: Post-Training Quantization for Vision State Space Models","summary":" State Space Models (SSMs), as key components of Mamaba, have gained\nincreasing attention for vision models recently, thanks to their efficient long\nsequence modeling capability. Given the computational cost of deploying SSMs on\nresource-limited edge devices, Post-Training Quantization (PTQ) is a technique\nwith the potential for efficient deployment of SSMs. In this work, we propose\nQMamba, one of the first PTQ frameworks to our knowledge, designed for vision\nSSMs based on the analysis of the activation distributions in SSMs. We reveal\nthat the distribution of discrete parameters exhibits long-tailed skewness and\nthe distribution of the hidden state sequence exhibits highly dynamic\nvariations. Correspondingly, we design Long-tailed Skewness Quantization (LtSQ)\nto quantize discrete parameters and Temporal Group Quantization (TGQ) to\nquantize hidden states, which reduces the quantization errors. Extensive\nexperiments demonstrate that QMamba outperforms advanced PTQ methods on vision\nmodels across multiple model sizes and architectures. Notably, QMamba surpasses\nexisting methods by 21.0% on ImageNet classification with 4-bit activations.\n","authors":["Yinglong Li","Xiaoyu Liu","Jiacheng Li","Ruikang Xu","Yinda Chen","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2501.13624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13620v1","updated":"2025-01-23T12:42:42Z","published":"2025-01-23T12:42:42Z","title":"Cognitive Paradigms for Evaluating VLMs on Visual Reasoning Task","summary":" Evaluating the reasoning capabilities of Vision-Language Models (VLMs) in\ncomplex visual tasks provides valuable insights into their potential and\nlimitations. In this work, we assess the performance of VLMs on the challenging\nBongard Openworld Problems benchmark, which involves reasoning over natural\nimages. We propose and evaluate three human-inspired paradigms: holistic\nanalysis (global context processing), deductive rule learning (explicit rule\nderivation and application), and componential analysis (structured\ndecomposition of images into components). Our results demonstrate that\nstate-of-the-art models, including GPT-4o and Gemini, not only surpass human\nbenchmarks but also excel in structured reasoning tasks, with componential\nanalysis proving especially effective. However, ablation studies reveal key\nchallenges, such as handling synthetic images, making fine-grained\ndistinctions, and interpreting nuanced contextual information. 
These insights\nunderscore the need for further advancements in model robustness and\ngeneralization, while highlighting the transformative potential of structured\nreasoning approaches in enhancing VLM capabilities.\n","authors":["Mohit Vaishnav","Tanel Tammet"],"pdf_url":"https://arxiv.org/pdf/2501.13620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16291v2","updated":"2025-01-23T11:29:02Z","published":"2024-02-26T04:18:42Z","title":"SaRPFF: A Self-Attention with Register-based Pyramid Feature Fusion\n module for enhanced RLD detection","summary":" Detecting objects across varying scales is still a challenge in computer\nvision, particularly in agricultural applications like Rice Leaf Disease (RLD)\ndetection, where objects exhibit significant scale variations (SV).\nConventional object detection (OD) like Faster R-CNN, SSD, and YOLO methods\noften fail to effectively address SV, leading to reduced accuracy and missed\ndetections. To tackle this, we propose SaRPFF (Self-Attention with\nRegister-based Pyramid Feature Fusion), a novel module designed to enhance\nmulti-scale object detection. SaRPFF integrates 2D-Multi-Head Self-Attention\n(MHSA) with Register tokens, improving feature interpretability by mitigating\nartifacts within MHSA. Additionally, it integrates efficient attention atrous\nconvolutions into the pyramid feature fusion and introduce a deconvolutional\nlayer for refined up-sampling. We evaluate SaRPFF on YOLOv7 using the MRLD and\nCOCO datasets. Our approach demonstrates a +2.61% improvement in Average\nPrecision (AP) on the MRLD dataset compared to the baseline FPN method in\nYOLOv7. Furthermore, SaRPFF outperforms other FPN variants, including BiFPN,\nNAS-FPN, and PANET, showcasing its versatility and potential to advance OD\ntechniques. This study highlights SaRPFF effectiveness in addressing SV\nchallenges and its adaptability across FPN-based OD models.\n","authors":["Yunusa Haruna","Shiyin Qin","Abdulrahman Hamman Adama Chukkol","Isah Bello","Adamu Lawan"],"pdf_url":"https://arxiv.org/pdf/2402.16291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09600v4","updated":"2025-01-23T11:25:43Z","published":"2025-01-16T15:22:06Z","title":"Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid\n Prototyping in Virtual Reality Applications","summary":" SLAM is a foundational technique with broad applications in robotics and\nAR/VR. SLAM simulations evaluate new concepts, but testing on\nresource-constrained devices, such as VR HMDs, faces challenges: high\ncomputational cost and restricted sensor data access. This work proposes a\nsparse framework using mesh geometry projections as features, which improves\nefficiency and circumvents direct sensor data access, advancing SLAM research\nas we demonstrate in VR and through numerical evaluation.\n","authors":["Carlos Augusto Pinheiro de Sousa","Heiko Hamann","Oliver Deussen"],"pdf_url":"https://arxiv.org/pdf/2501.09600v4.pdf","comment":"Accepted to ENPT XR at IEEE VR 2025"},{"id":"http://arxiv.org/abs/2412.10734v4","updated":"2025-01-23T11:20:19Z","published":"2024-12-14T08:08:40Z","title":"OmniHD-Scenes: A Next-Generation Multimodal Dataset for Autonomous\n Driving","summary":" The rapid advancement of deep learning has intensified the need for\ncomprehensive data for use by autonomous driving algorithms. High-quality\ndatasets are crucial for the development of effective data-driven autonomous\ndriving solutions. 
Next-generation autonomous driving datasets must be\nmultimodal, incorporating data from advanced sensors that feature extensive\ndata coverage, detailed annotations, and diverse scene representation. To\naddress this need, we present OmniHD-Scenes, a large-scale multimodal dataset\nthat provides comprehensive omnidirectional high-definition data. The\nOmniHD-Scenes dataset combines data from 128-beam LiDAR, six cameras, and six\n4D imaging radar systems to achieve full environmental perception. The dataset\ncomprises 1501 clips, each approximately 30-s long, totaling more than 450K\nsynchronized frames and more than 5.85 million synchronized sensor data points.\nWe also propose a novel 4D annotation pipeline. To date, we have annotated 200\nclips with more than 514K precise 3D bounding boxes. These clips also include\nsemantic segmentation annotations for static scene elements. Additionally, we\nintroduce a novel automated pipeline for generation of the dense occupancy\nground truth, which effectively leverages information from non-key frames.\nAlongside the proposed dataset, we establish comprehensive evaluation metrics,\nbaseline models, and benchmarks for 3D detection and semantic occupancy\nprediction. These benchmarks utilize surround-view cameras and 4D imaging radar\nto explore cost-effective sensor solutions for autonomous driving applications.\nExtensive experiments demonstrate the effectiveness of our low-cost sensor\nconfiguration and its robustness under adverse conditions. Data will be\nreleased at https://www.2077ai.com/OmniHD-Scenes.\n","authors":["Lianqing Zheng","Long Yang","Qunshu Lin","Wenjin Ai","Minghao Liu","Shouyi Lu","Jianan Liu","Hongze Ren","Jingyue Mo","Xiaokai Bai","Jie Bai","Zhixiong Ma","Xichan Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.10734v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13073v2","updated":"2025-01-23T11:19:28Z","published":"2025-01-22T18:35:57Z","title":"CHaRNet: Conditioned Heatmap Regression for Robust Dental Landmark\n Localization","summary":" Identifying anatomical landmarks in 3D dental models is crucial for\northodontic treatment. Manually placing these key points is complex,\ntime-consuming, and requires expert knowledge. While some machine learning\nmethods have been proposed for automatic tooth landmark detection in 3D\nIntraoral Scans (IOS), research remains limited, with no fully end-to-end\napproaches that avoid teeth segmentation. We propose CHaRNet (Conditioned\nHeatmap Regression Network), the first end-to-end deep learning method for\ntooth landmark detection in 3D IOS. Unlike traditional two-stage methods that\nsegment teeth before detecting landmarks, CHaRNet directly detects landmarks on\nthe input point cloud. It consists of four key modules: (1) a point cloud\nencoder, (2) a point cloud decoder with a heatmap regression head, (3) a teeth\npresence classification head, and (4) the innovative Conditioned Heatmap\nRegression (CHaR) module. The CHaR module refines landmark regression by\nleveraging teeth presence classification, enabling dynamic adaptation to cases\nwith missing teeth and improving accuracy in complex dental models. We evaluate\nCHaRNet using five point cloud learning algorithms to validate the\neffectiveness of the CHaR module and test it on a clinical dataset of 1,214\nannotated 3D dental models. Both the dataset and code will be publicly released\nto address the lack of open datasets in orthodontics, promote benchmarking, and\ninspire new research. 
CHaRNet achieves a Mean Euclidean Distance Error (MEDE)\nof 1.28 mm and a Mean Success Ratio (MSR) of 82.40%, demonstrating robust\nperformance. Notably, it excels in handling irregular dental geometries, such\nas models with missing teeth. This end-to-end approach streamlines orthodontic\nworkflows, improves 3D IOS analysis precision, and facilitates efficient\ncomputer-assisted treatment planning.\n","authors":["José Rodríguez-Ortega","Siham Tabik"],"pdf_url":"https://arxiv.org/pdf/2501.13073v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13563v1","updated":"2025-01-23T11:10:02Z","published":"2025-01-23T11:10:02Z","title":"Black-Box Adversarial Attack on Vision Language Models for Autonomous\n Driving","summary":" Vision-language models (VLMs) have significantly advanced autonomous driving\n(AD) by enhancing reasoning capabilities; however, these models remain highly\nsusceptible to adversarial attacks. While existing research has explored\nwhite-box attacks to some extent, the more practical and challenging black-box\nscenarios remain largely underexplored due to their inherent difficulty. In\nthis paper, we take the first step toward designing black-box adversarial\nattacks specifically targeting VLMs in AD. We identify two key challenges for\nachieving effective black-box attacks in this context: the effectiveness across\ndriving reasoning chains in AD systems and the dynamic nature of driving\nscenarios. To address this, we propose Cascading Adversarial Disruption (CAD).\nIt first introduces Decision Chain Disruption, which targets low-level\nreasoning breakdown by generating and injecting deceptive semantics, ensuring\nthe perturbations remain effective across the entire decision-making chain.\nBuilding on this, we present Risky Scene Induction, which addresses dynamic\nadaptation by leveraging a surrogate VLM to understand and construct high-level\nrisky scenarios that are likely to result in critical errors in the current\ndriving contexts. Extensive experiments conducted on multiple AD VLMs and\nbenchmarks demonstrate that CAD achieves state-of-the-art attack effectiveness,\nsignificantly outperforming existing methods (+13.43% on average). Moreover, we\nvalidate its practical applicability through real-world attacks on AD vehicles\npowered by VLMs, where the route completion rate drops by 61.11% and the\nvehicle crashes directly into the obstacle vehicle with adversarial patches.\nFinally, we release CADA dataset, comprising 18,808 adversarial\nvisual-question-answer pairs, to facilitate further evaluation and research in\nthis critical domain. Our codes and dataset will be available after paper's\nacceptance.\n","authors":["Lu Wang","Tianyuan Zhang","Yang Qu","Siyuan Liang","Yuwei Chen","Aishan Liu","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2501.13563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13558v1","updated":"2025-01-23T11:05:45Z","published":"2025-01-23T11:05:45Z","title":"GoDe: Gaussians on Demand for Progressive Level of Detail and Scalable\n Compression","summary":" 3D Gaussian Splatting enhances real-time performance in novel view synthesis\nby representing scenes with mixtures of Gaussians and utilizing differentiable\nrasterization. 
However, it typically requires large storage capacity and high\nVRAM, demanding the design of effective pruning and compression techniques.\nExisting methods, while effective in some scenarios, struggle with scalability\nand fail to adapt models based on critical factors such as computing\ncapabilities or bandwidth, requiring to re-train the model under different\nconfigurations. In this work, we propose a novel, model-agnostic technique that\norganizes Gaussians into several hierarchical layers, enabling progressive\nLevel of Detail (LoD) strategy. This method, combined with recent approach of\ncompression of 3DGS, allows a single model to instantly scale across several\ncompression ratios, with minimal to none impact to quality compared to a single\nnon-scalable model and without requiring re-training. We validate our approach\non typical datasets and benchmarks, showcasing low distortion and substantial\ngains in terms of scalability and adaptability.\n","authors":["Francesco Di Sario","Riccardo Renzulli","Marco Grangetto","Akihiro Sugimoto","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2501.13558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09916v3","updated":"2025-01-23T11:03:13Z","published":"2024-08-19T11:44:40Z","title":"Attribution Analysis Meets Model Editing: Advancing Knowledge Correction\n in Vision Language Models with VisEdit","summary":" Model editing aims to correct outdated or erroneous knowledge in large models\nwithout costly retraining. Recent research discovered that the mid-layer\nrepresentation of the subject's final token in a prompt has a strong influence\non factual predictions, and developed Large Language Model (LLM) editing\ntechniques based on this observation. However, for Vision-LLMs (VLLMs), how\nvisual representations impact the predictions from a decoder-only language\nmodel remains largely unexplored. To the best of our knowledge, model editing\nfor VLLMs has not been extensively studied in the literature. In this work, we\nemploy the contribution allocation and noise perturbation methods to measure\nthe contributions of visual representations for token predictions. Our\nattribution analysis shows that visual representations in mid-to-later layers\nthat are highly relevant to the prompt contribute significantly to predictions.\nBased on these insights, we propose VisEdit, a novel model editor for VLLMs\nthat effectively corrects knowledge by editing intermediate visual\nrepresentations in regions important to the edit prompt. We evaluated VisEdit\nusing multiple VLLM backbones and public VLLM editing benchmark datasets. The\nresults show the superiority of VisEdit over the strong baselines adapted from\nexisting state-of-the-art editors for LLMs.\n","authors":["Qizhou Chen","Taolin Zhang","Chengyu Wang","Xiaofeng He","Dakan Wang","Tingting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09916v3.pdf","comment":"Accepted to AAAI-2025 as an oral presentation"},{"id":"http://arxiv.org/abs/2501.13554v1","updated":"2025-01-23T10:57:22Z","published":"2025-01-23T10:57:22Z","title":"One-Prompt-One-Story: Free-Lunch Consistent Text-to-Image Generation\n Using a Single Prompt","summary":" Text-to-image generation models can create high-quality images from input\nprompts. However, they struggle to support the consistent generation of\nidentity-preserving requirements for storytelling. Existing approaches to this\nproblem typically require extensive training in large datasets or additional\nmodifications to the original model architectures. 
This limits their\napplicability across different domains and diverse diffusion model\nconfigurations. In this paper, we first observe the inherent capability of\nlanguage models, coined context consistency, to comprehend identity through\ncontext with a single prompt. Drawing inspiration from the inherent context\nconsistency, we propose a novel training-free method for consistent\ntext-to-image (T2I) generation, termed \"One-Prompt-One-Story\" (1Prompt1Story).\nOur approach 1Prompt1Story concatenates all prompts into a single input for T2I\ndiffusion models, initially preserving character identities. We then refine the\ngeneration process using two novel techniques: Singular-Value Reweighting and\nIdentity-Preserving Cross-Attention, ensuring better alignment with the input\ndescription for each frame. In our experiments, we compare our method against\nvarious existing consistent T2I generation approaches to demonstrate its\neffectiveness through quantitative metrics and qualitative assessments. Code is\navailable at https://github.com/byliutao/1Prompt1Story.\n","authors":["Tao Liu","Kai Wang","Senmao Li","Joost van de Weijer","Fahad Shahbaz Khan","Shiqi Yang","Yaxing Wang","Jian Yang","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.13554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13536v1","updated":"2025-01-23T10:35:22Z","published":"2025-01-23T10:35:22Z","title":"ReasVQA: Advancing VideoQA with Imperfect Reasoning Process","summary":" Video Question Answering (VideoQA) is a challenging task that requires\nunderstanding complex visual and temporal relationships within videos to answer\nquestions accurately. In this work, we introduce \\textbf{ReasVQA}\n(Reasoning-enhanced Video Question Answering), a novel approach that leverages\nreasoning processes generated by Multimodal Large Language Models (MLLMs) to\nimprove the performance of VideoQA models. Our approach consists of three\nphases: reasoning generation, reasoning refinement, and learning from\nreasoning. First, we generate detailed reasoning processes using additional\nMLLMs, and second refine them via a filtering step to ensure data quality.\nFinally, we use the reasoning data, which might be in an imperfect form, to\nguide the VideoQA model via multi-task learning, on how to interpret and answer\nquestions based on a given video. We evaluate ReasVQA on three popular\nbenchmarks, and our results establish new state-of-the-art performance with\nsignificant improvements of +2.9 on NExT-QA, +7.3 on STAR, and +5.9 on\nIntentQA. Our findings demonstrate the supervising benefits of integrating\nreasoning processes into VideoQA. Further studies validate each component of\nour method, also with different backbones and MLLMs, and again highlight the\nadvantages of this simple but effective method. 
We offer a new perspective on\nenhancing VideoQA performance by utilizing advanced reasoning techniques,\nsetting a new benchmark in this research field.\n","authors":["Jianxin Liang","Xiaojun Meng","Huishuai Zhang","Yueqian Wang","Jiansheng Wei","Dongyan Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.13536v1.pdf","comment":"Accepted to main conference at NAACL 2025; 8 pages;"},{"id":"http://arxiv.org/abs/2501.13529v1","updated":"2025-01-23T10:26:48Z","published":"2025-01-23T10:26:48Z","title":"Overcoming Support Dilution for Robust Few-shot Semantic Segmentation","summary":" Few-shot Semantic Segmentation (FSS) is a challenging task that utilizes\nlimited support images to segment associated unseen objects in query images.\nHowever, recent FSS methods are observed to perform worse, when enlarging the\nnumber of shots. As the support set enlarges, existing FSS networks struggle to\nconcentrate on the high-contributed supports and could easily be overwhelmed by\nthe low-contributed supports that could severely impair the mask predictions.\nIn this work, we study this challenging issue, called support dilution, our\ngoal is to recognize, select, preserve, and enhance those high-contributed\nsupports in the raw support pool. Technically, our method contains three novel\nparts. First, we propose a contribution index, to quantitatively estimate if a\nhigh-contributed support dilutes. Second, we develop the Symmetric Correlation\n(SC) module to preserve and enhance the high-contributed support features,\nminimizing the distraction by the low-contributed features. Third, we design\nthe Support Image Pruning operation, to retrieve a compact and high quality\nsubset by discarding low-contributed supports. We conduct extensive experiments\non two FSS benchmarks, COCO-20i and PASCAL-5i, the segmentation results\ndemonstrate the compelling performance of our solution over state-of-the-art\nFSS approaches. Besides, we apply our solution for online segmentation and\nreal-world segmentation, convincing segmentation results showing the practical\nability of our work for real-world demonstrations.\n","authors":["Wailing Tang","Biqi Yang","Pheng-Ann Heng","Yun-Hui Liu","Chi-Wing Fu"],"pdf_url":"https://arxiv.org/pdf/2501.13529v1.pdf","comment":"15 pages, 15 figures"},{"id":"http://arxiv.org/abs/2501.13528v1","updated":"2025-01-23T10:23:04Z","published":"2025-01-23T10:23:04Z","title":"Diffusion-based Perceptual Neural Video Compression with Temporal\n Diffusion Information Reuse","summary":" Recently, foundational diffusion models have attracted considerable attention\nin image compression tasks, whereas their application to video compression\nremains largely unexplored. In this article, we introduce DiffVC, a\ndiffusion-based perceptual neural video compression framework that effectively\nintegrates foundational diffusion model with the video conditional coding\nparadigm. This framework uses temporal context from previously decoded frame\nand the reconstructed latent representation of the current frame to guide the\ndiffusion model in generating high-quality results. To accelerate the iterative\ninference process of diffusion model, we propose the Temporal Diffusion\nInformation Reuse (TDIR) strategy, which significantly enhances inference\nefficiency with minimal performance loss by reusing the diffusion information\nfrom previous frames. 
Additionally, to address the challenges posed by\ndistortion differences across various bitrates, we propose the Quantization\nParameter-based Prompting (QPP) mechanism, which utilizes quantization\nparameters as prompts fed into the foundational diffusion model to explicitly\nmodulate intermediate features, thereby enabling a robust variable bitrate\ndiffusion-based neural compression framework. Experimental results demonstrate\nthat our proposed solution delivers excellent performance in both perception\nmetrics and visual quality.\n","authors":["Wenzhuo Ma","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2501.13528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00740v2","updated":"2025-01-23T10:22:58Z","published":"2025-01-01T06:07:02Z","title":"RORem: Training a Robust Object Remover with Human-in-the-Loop","summary":" Despite the significant advancements, existing object removal methods\nstruggle with incomplete removal, incorrect content synthesis and blurry\nsynthesized regions, resulting in low success rates. Such issues are mainly\ncaused by the lack of high-quality paired training data, as well as the\nself-supervised training paradigm adopted in these methods, which forces the\nmodel to in-paint the masked regions, leading to ambiguity between synthesizing\nthe masked objects and restoring the background. To address these issues, we\npropose a semi-supervised learning strategy with human-in-the-loop to create\nhigh-quality paired training data, aiming to train a Robust Object Remover\n(RORem). We first collect 60K training pairs from open-source datasets to train\nan initial object removal model for generating removal samples, and then\nutilize human feedback to select a set of high-quality object removal pairs,\nwith which we train a discriminator to automate the following training data\ngeneration process. By iterating this process for several rounds, we finally\nobtain a substantial object removal dataset with over 200K pairs. Fine-tuning\nthe pre-trained stable diffusion model with this dataset, we obtain our RORem,\nwhich demonstrates state-of-the-art object removal performance in terms of both\nreliability and image quality. Particularly, RORem improves the object removal\nsuccess rate over previous methods by more than 18\\%. The dataset, source code\nand trained model are available at https://github.com/leeruibin/RORem.\n","authors":["Ruibin Li","Tao Yang","Song Guo","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.00740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10148v2","updated":"2025-01-23T10:22:02Z","published":"2024-01-18T17:22:11Z","title":"Explicitly Disentangled Representations in Object-Centric Learning","summary":" Extracting structured representations from raw visual data is an important\nand long-standing challenge in machine learning. Recently, techniques for\nunsupervised learning of object-centric representations have raised growing\ninterest. In this context, enhancing the robustness of the latent features can\nimprove the efficiency and effectiveness of the training of downstream tasks. A\npromising step in this direction is to disentangle the factors that cause\nvariation in the data. Previously, Invariant Slot Attention disentangled\nposition, scale, and orientation from the remaining features. Extending this\napproach, we focus on separating the shape and texture components. 
In\nparticular, we propose a novel architecture that biases object-centric models\ntoward disentangling shape and texture components into two non-overlapping\nsubsets of the latent space dimensions. These subsets are known a priori, hence\nbefore the training process. Experiments on a range of object-centric\nbenchmarks reveal that our approach achieves the desired disentanglement while\nalso numerically improving baseline performance in most cases. In addition, we\nshow that our method can generate novel textures for a specific object or\ntransfer textures between objects with distinct shapes.\n","authors":["Riccardo Majellaro","Jonathan Collu","Aske Plaat","Thomas M. Moerland"],"pdf_url":"https://arxiv.org/pdf/2401.10148v2.pdf","comment":"Published in TMLR"},{"id":"http://arxiv.org/abs/2501.13518v1","updated":"2025-01-23T10:06:52Z","published":"2025-01-23T10:06:52Z","title":"Text-driven Online Action Detection","summary":" Detecting actions as they occur is essential for applications like video\nsurveillance, autonomous driving, and human-robot interaction. Known as online\naction detection, this task requires classifying actions in streaming videos,\nhandling background noise, and coping with incomplete actions. Transformer\narchitectures are the current state-of-the-art, yet the potential of recent\nadvancements in computer vision, particularly vision-language models (VLMs),\nremains largely untapped for this problem, partly due to high computational\ncosts. In this paper, we introduce TOAD: a Text-driven Online Action Detection\narchitecture that supports zero-shot and few-shot learning. TOAD leverages CLIP\n(Contrastive Language-Image Pretraining) textual embeddings, enabling efficient\nuse of VLMs without significant computational overhead. Our model achieves\n82.46% mAP on the THUMOS14 dataset, outperforming existing methods, and sets\nnew baselines for zero-shot and few-shot performance on the THUMOS14 and\nTVSeries datasets.\n","authors":["Manuel Benavent-Lledo","David Mulero-Pérez","David Ortiz-Perez","Jose Garcia-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2501.13518v1.pdf","comment":"Published in Integrated Computer-Aided Engineering"},{"id":"http://arxiv.org/abs/2501.13517v1","updated":"2025-01-23T10:05:25Z","published":"2025-01-23T10:05:25Z","title":"Propensity-driven Uncertainty Learning for Sample Exploration in\n Source-Free Active Domain Adaptation","summary":" Source-free active domain adaptation (SFADA) addresses the challenge of\nadapting a pre-trained model to new domains without access to source data while\nminimizing the need for target domain annotations. This scenario is\nparticularly relevant in real-world applications where data privacy, storage\nlimitations, or labeling costs are significant concerns. Key challenges in\nSFADA include selecting the most informative samples from the target domain for\nlabeling, effectively leveraging both labeled and unlabeled target data, and\nadapting the model without relying on source domain information. Additionally,\nexisting methods often struggle with noisy or outlier samples and may require\nimpractical progressive labeling during training. To effectively select more\ninformative samples without frequently requesting human annotations, we propose\nthe Propensity-driven Uncertainty Learning (ProULearn) framework. ProULearn\nutilizes a novel homogeneity propensity estimation mechanism combined with\ncorrelation index calculation to evaluate feature-level relationships. 
This\napproach enables the identification of representative and challenging samples\nwhile avoiding noisy outliers. Additionally, we develop a central correlation\nloss to refine pseudo-labels and create compact class distributions during\nadaptation. In this way, ProULearn effectively bridges the domain gap and\nmaximizes adaptation performance. The principles of informative sample\nselection underlying ProULearn have broad implications beyond SFADA, offering\nbenefits across various deep learning tasks where identifying key data points\nor features is crucial. Extensive experiments on four benchmark datasets\ndemonstrate that ProULearn outperforms state-of-the-art methods in domain\nadaptation scenarios.\n","authors":["Zicheng Pan","Xiaohan Yu","Weichuan Zhang","Yongsheng Gao"],"pdf_url":"https://arxiv.org/pdf/2501.13517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13514v1","updated":"2025-01-23T10:01:33Z","published":"2025-01-23T10:01:33Z","title":"Self-Supervised Diffusion MRI Denoising via Iterative and Stable\n Refinement","summary":" Magnetic Resonance Imaging (MRI), including diffusion MRI (dMRI), serves as a\n``microscope'' for anatomical structures and routinely mitigates the influence\nof low signal-to-noise ratio scans by compromising temporal or spatial\nresolution. However, these compromises fail to meet clinical demands for both\nefficiency and precision. Consequently, denoising is a vital preprocessing\nstep, particularly for dMRI, where clean data is unavailable. In this paper, we\nintroduce Di-Fusion, a fully self-supervised denoising method that leverages\nthe latter diffusion steps and an adaptive sampling process. Unlike previous\napproaches, our single-stage framework achieves efficient and stable training\nwithout extra noise model training and offers adaptive and controllable results\nin the sampling process. Our thorough experiments on real and simulated data\ndemonstrate that Di-Fusion achieves state-of-the-art performance in\nmicrostructure modeling, tractography tracking, and other downstream tasks.\n","authors":["Chenxu Wu","Qingpeng Kong","Zihang Jiang","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.13514v1.pdf","comment":"39pages, 34figures"},{"id":"http://arxiv.org/abs/2411.02293v5","updated":"2025-01-23T09:51:37Z","published":"2024-11-04T17:21:42Z","title":"Hunyuan3D 1.0: A Unified Framework for Text-to-3D and Image-to-3D\n Generation","summary":" While 3D generative models have greatly improved artists' workflows, the\nexisting diffusion models for 3D generation suffer from slow generation and\npoor generalization. To address this issue, we propose a two-stage approach\nnamed Hunyuan3D 1.0 including a lite version and a standard version, that both\nsupport text- and image-conditioned generation. In the first stage, we employ a\nmulti-view diffusion model that efficiently generates multi-view RGB in\napproximately 4 seconds. These multi-view images capture rich details of the 3D\nasset from different viewpoints, relaxing the tasks from single-view to\nmulti-view reconstruction. In the second stage, we introduce a feed-forward\nreconstruction model that rapidly and faithfully reconstructs the 3D asset\ngiven the generated multi-view images in approximately 7 seconds. The\nreconstruction network learns to handle noises and in-consistency introduced by\nthe multi-view diffusion and leverages the available information from the\ncondition image to efficiently recover the 3D structure. 
Our framework involves\nthe text-to-image model, i.e., Hunyuan-DiT, making it a unified framework to\nsupport both text- and image-conditioned 3D generation. Our standard version\nhas 3x more parameters than our lite and other existing model. Our Hunyuan3D\n1.0 achieves an impressive balance between speed and quality, significantly\nreducing generation time while maintaining the quality and diversity of the\nproduced assets.\n","authors":["Xianghui Yang","Huiwen Shi","Bowen Zhang","Fan Yang","Jiacheng Wang","Hongxu Zhao","Xinhai Liu","Xinzhou Wang","Qingxiang Lin","Jiaao Yu","Lifu Wang","Jing Xu","Zebin He","Zhuo Chen","Sicong Liu","Junta Wu","Yihang Lian","Shaoxiong Yang","Yuhong Liu","Yong Yang","Di Wang","Jie Jiang","Chunchao Guo"],"pdf_url":"https://arxiv.org/pdf/2411.02293v5.pdf","comment":"Technical Report; 3D Generation"},{"id":"http://arxiv.org/abs/2501.13492v1","updated":"2025-01-23T09:14:15Z","published":"2025-01-23T09:14:15Z","title":"Quantized Spike-driven Transformer","summary":" Spiking neural networks are emerging as a promising energy-efficient\nalternative to traditional artificial neural networks due to their spike-driven\nparadigm. However, recent research in the SNN domain has mainly focused on\nenhancing accuracy by designing large-scale Transformer structures, which\ntypically rely on substantial computational resources, limiting their\ndeployment on resource-constrained devices. To overcome this challenge, we\npropose a quantized spike-driven Transformer baseline (QSD-Transformer), which\nachieves reduced resource demands by utilizing a low bit-width parameter.\nRegrettably, the QSD-Transformer often suffers from severe performance\ndegradation. In this paper, we first conduct empirical analysis and find that\nthe bimodal distribution of quantized spike-driven self-attention (Q-SDSA)\nleads to spike information distortion (SID) during quantization, causing\nsignificant performance degradation. To mitigate this issue, we take\ninspiration from mutual information entropy and propose a bi-level optimization\nstrategy to rectify the information distribution in Q-SDSA. Specifically, at\nthe lower level, we introduce an information-enhanced LIF to rectify the\ninformation distribution in Q-SDSA. At the upper level, we propose a\nfine-grained distillation scheme for the QSD-Transformer to align the\ndistribution in Q-SDSA with that in the counterpart ANN. 
By integrating the\nbi-level optimization strategy, the QSD-Transformer can attain enhanced energy\nefficiency without sacrificing its high-performance advantage.For instance,\nwhen compared to the prior SNN benchmark on ImageNet, the QSD-Transformer\nachieves 80.3\\% top-1 accuracy, accompanied by significant reductions of\n6.0$\\times$ and 8.1$\\times$ in power consumption and model size, respectively.\nCode is available at https://github.com/bollossom/QSD-Transformer.\n","authors":["Xuerui Qiu","Jieyuan Zhang","Wenjie Wei","Honglin Cao","Junsheng Guo","Rui-Jie Zhu","Yimeng Shan","Yang Yang","Malu Zhang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2501.13492v1.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2501.13475v1","updated":"2025-01-23T08:46:39Z","published":"2025-01-23T08:46:39Z","title":"LDR-Net: A Novel Framework for AI-generated Image Detection via\n Localized Discrepancy Representation","summary":" With the rapid advancement of generative models, the visual quality of\ngenerated images has become nearly indistinguishable from the real ones, posing\nchallenges to content authenticity verification. Existing methods for detecting\nAI-generated images primarily focus on specific forgery clues, which are often\ntailored to particular generative models like GANs or diffusion models. These\napproaches struggle to generalize across architectures. Building on the\nobservation that generative images often exhibit local anomalies, such as\nexcessive smoothness, blurred textures, and unnatural pixel variations in small\nregions, we propose the localized discrepancy representation network (LDR-Net),\na novel approach for detecting AI-generated images. LDR-Net captures smoothing\nartifacts and texture irregularities, which are common but often overlooked. It\nintegrates two complementary modules: local gradient autocorrelation (LGA)\nwhich models local smoothing anomalies to detect smoothing anomalies, and local\nvariation pattern (LVP) which captures unnatural regularities by modeling the\ncomplexity of image patterns. By merging LGA and LVP features, a comprehensive\nrepresentation of localized discrepancies can be provided. Extensive\nexperiments demonstrate that our LDR-Net achieves state-of-the-art performance\nin detecting generated images and exhibits satisfactory generalization across\nunseen generative models. The code will be released upon acceptance of this\npaper.\n","authors":["JiaXin Chen","Miao Hu","DengYong Zhang","Yun Song","Xin Liao"],"pdf_url":"https://arxiv.org/pdf/2501.13475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13470v1","updated":"2025-01-23T08:40:54Z","published":"2025-01-23T08:40:54Z","title":"Leveraging Textual Anatomical Knowledge for Class-Imbalanced\n Semi-Supervised Multi-Organ Segmentation","summary":" Annotating 3D medical images demands substantial time and expertise, driving\nthe adoption of semi-supervised learning (SSL) for segmentation tasks. However,\nthe complex anatomical structures of organs often lead to significant class\nimbalances, posing major challenges for deploying SSL in real-world scenarios.\nDespite the availability of valuable prior information, such as inter-organ\nrelative positions and organ shape priors, existing SSL methods have yet to\nfully leverage these insights. 
To address this gap, we propose a novel approach\nthat integrates textual anatomical knowledge (TAK) into the segmentation model.\nSpecifically, we use GPT-4o to generate textual descriptions of anatomical\npriors, which are then encoded using a CLIP-based model. These encoded priors\nare injected into the segmentation model as parameters of the segmentation\nhead. Additionally, contrastive learning is employed to enhance the alignment\nbetween textual priors and visual features. Extensive experiments demonstrate\nthe superior performance of our method, significantly surpassing\nstate-of-the-art approaches. The source code will be available at:\nhttps://github.com/Lunn88/TAK-Semi.\n","authors":["Yuliang Gu","Weilun Tsao","Bo Du","Thierry Géraud","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2501.13470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13468v1","updated":"2025-01-23T08:33:10Z","published":"2025-01-23T08:33:10Z","title":"Streaming Video Understanding and Multi-round Interaction with\n Memory-enhanced Knowledge","summary":" Recent advances in Large Language Models (LLMs) have enabled the development\nof Video-LLMs, advancing multimodal learning by bridging video data with\nlanguage tasks. However, current video understanding models struggle with\nprocessing long video sequences, supporting multi-turn dialogues, and adapting\nto real-world dynamic scenarios. To address these issues, we propose\nStreamChat, a training-free framework for streaming video reasoning and\nconversational interaction. $\\StreamChat$ leverages a novel hierarchical memory\nsystem to efficiently process and compress video features over extended\nsequences, enabling real-time, multi-turn dialogue. Our framework incorporates\na parallel system scheduling strategy that enhances processing speed and\nreduces latency, ensuring robust performance in real-world applications.\nFurthermore, we introduce StreamBench, a versatile benchmark that evaluates\nstreaming video understanding across diverse media types and interactive\nscenarios, including multi-turn interactions and complex reasoning tasks.\nExtensive evaluations on StreamBench and other public benchmarks demonstrate\nthat StreamChat significantly outperforms existing state-of-the-art models in\nterms of accuracy and response times, confirming its effectiveness for\nstreaming video understanding. Code is available at StreamChat:\nhttps://github.com/hmxiong/StreamChat.\n","authors":["Haomiao Xiong","Zongxin Yang","Jiazuo Yu","Yunzhi Zhuge","Lu Zhang","Jiawen Zhu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2501.13468v1.pdf","comment":"Accepted to ICLR 2025. Code is available at\n https://github.com/hmxiong/StreamChat"},{"id":"http://arxiv.org/abs/2501.04322v2","updated":"2025-01-23T08:24:29Z","published":"2025-01-08T07:42:54Z","title":"Eve: Efficient Multimodal Vision Language Models with Elastic Visual\n Experts","summary":" Multimodal vision language models (VLMs) have made significant progress with\nthe support of continuously increasing model sizes and data volumes. Running\nVLMs on edge devices has become a challenge for their widespread application.\nThere are several efficient VLM efforts, but they often sacrifice linguistic\ncapabilities to enhance multimodal abilities, or require extensive training. To\naddress this quandary,we introduce the innovative framework of Efficient Vision\nLanguage Models with Elastic Visual Experts (Eve). 
By strategically\nincorporating adaptable visual expertise at multiple stages of training, Eve\nstrikes a balance between preserving linguistic abilities and augmenting\nmultimodal capabilities. This balanced approach results in a versatile model\nwith only 1.8B parameters that delivers significant improvements in both\nmultimodal and linguistic tasks. Notably, in configurations below 3B\nparameters, Eve distinctly outperforms in language benchmarks and achieves\nstate-of-the-art results 68.87% in VLM Benchmarks. Additionally, its multimodal\naccuracy outstrips that of the larger 7B LLaVA-1.5 model. Our code is available\nat https://github.com/rangmiao/Eve.\n","authors":["Miao Rang","Zhenni Bi","Chuanjian Liu","Yehui Tang","Kai Han","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2501.04322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13461v1","updated":"2025-01-23T08:23:45Z","published":"2025-01-23T08:23:45Z","title":"Knowledge-Informed Multi-Agent Trajectory Prediction at Signalized\n Intersections for Infrastructure-to-Everything","summary":" Multi-agent trajectory prediction at signalized intersections is crucial for\ndeveloping efficient intelligent transportation systems and safe autonomous\ndriving systems. Due to the complexity of intersection scenarios and the\nlimitations of single-vehicle perception, the performance of vehicle-centric\nprediction methods has reached a plateau. Furthermore, most works underutilize\ncritical intersection information, including traffic signals, and behavior\npatterns induced by road structures. Therefore, we propose a multi-agent\ntrajectory prediction framework at signalized intersections dedicated to\nInfrastructure-to-Everything (I2XTraj). Our framework leverages dynamic graph\nattention to integrate knowledge from traffic signals and driving behaviors. A\ncontinuous signal-informed mechanism is proposed to adaptively process\nreal-time traffic signals from infrastructure devices. Additionally, leveraging\nthe prior knowledge of the intersection topology, we propose a driving strategy\nawareness mechanism to model the joint distribution of goal intentions and\nmaneuvers. To the best of our knowledge, I2XTraj represents the first\nmulti-agent trajectory prediction framework explicitly designed for\ninfrastructure deployment, supplying subscribable prediction services to all\nvehicles at intersections. I2XTraj demonstrates state-of-the-art performance on\nboth the Vehicle-to-Infrastructure dataset V2X-Seq and the aerial-view dataset\nSinD for signalized intersections. Quantitative evaluations show that our\napproach outperforms existing methods by more than 30% in both multi-agent and\nsingle-agent scenarios.\n","authors":["Huilin Yin","Yangwenhui Xu","Jiaxiang Li","Hao Zhang","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2501.13461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13452v1","updated":"2025-01-23T08:06:11Z","published":"2025-01-23T08:06:11Z","title":"EchoVideo: Identity-Preserving Human Video Generation by Multimodal\n Feature Fusion","summary":" Recent advancements in video generation have significantly impacted various\ndownstream applications, particularly in identity-preserving video generation\n(IPT2V). However, existing methods struggle with \"copy-paste\" artifacts and low\nsimilarity issues, primarily due to their reliance on low-level facial image\ninformation. This dependence can result in rigid facial appearances and\nartifacts reflecting irrelevant details. 
To address these challenges, we\npropose EchoVideo, which employs two key strategies: (1) an Identity Image-Text\nFusion Module (IITF) that integrates high-level semantic features from text,\ncapturing clean facial identity representations while discarding occlusions,\nposes, and lighting variations to avoid the introduction of artifacts; (2) a\ntwo-stage training strategy, incorporating a stochastic method in the second\nphase to randomly utilize shallow facial information. The objective is to\nbalance the enhancements in fidelity provided by shallow features while\nmitigating excessive reliance on them. This strategy encourages the model to\nutilize high-level features during training, ultimately fostering a more robust\nrepresentation of facial identities. EchoVideo effectively preserves facial\nidentities and maintains full-body integrity. Extensive experiments demonstrate\nthat it achieves excellent results in generating high-quality, controllability\nand fidelity videos.\n","authors":["Jiangchuan Wei","Shiyue Yan","Wenfeng Lin","Boyuan Liu","Renjie Chen","Mingyu Guo"],"pdf_url":"https://arxiv.org/pdf/2501.13452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13449v1","updated":"2025-01-23T08:02:59Z","published":"2025-01-23T08:02:59Z","title":"MultiDreamer3D: Multi-concept 3D Customization with Concept-Aware\n Diffusion Guidance","summary":" While single-concept customization has been studied in 3D, multi-concept\ncustomization remains largely unexplored. To address this, we propose\nMultiDreamer3D that can generate coherent multi-concept 3D content in a\ndivide-and-conquer manner. First, we generate 3D bounding boxes using an\nLLM-based layout controller. Next, a selective point cloud generator creates\ncoarse point clouds for each concept. These point clouds are placed in the 3D\nbounding boxes and initialized into 3D Gaussian Splatting with concept labels,\nenabling precise identification of concept attributions in 2D projections.\nFinally, we refine 3D Gaussians via concept-aware interval score matching,\nguided by concept-aware diffusion. Our experimental results show that\nMultiDreamer3D not only ensures object presence and preserves the distinct\nidentities of each concept but also successfully handles complex cases such as\nproperty change or interaction. To the best of our knowledge, we are the first\nto address the multi-concept customization in 3D.\n","authors":["Wooseok Song","Seunggyu Chang","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2501.13449v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2312.04215v2","updated":"2025-01-23T08:01:17Z","published":"2023-12-07T11:03:42Z","title":"Guided Reconstruction with Conditioned Diffusion Models for Unsupervised\n Anomaly Detection in Brain MRIs","summary":" The application of supervised models to clinical screening tasks is\nchallenging due to the need for annotated data for each considered pathology.\nUnsupervised Anomaly Detection (UAD) is an alternative approach that aims to\nidentify any anomaly as an outlier from a healthy training distribution. A\nprevalent strategy for UAD in brain MRI involves using generative models to\nlearn the reconstruction of healthy brain anatomy for a given input image. As\nthese models should fail to reconstruct unhealthy structures, the\nreconstruction errors indicate anomalies. However, a significant challenge is\nto balance the accurate reconstruction of healthy anatomy and the undesired\nreplication of abnormal structures. 
While diffusion models have shown promising\nresults with detailed and accurate reconstructions, they face challenges in\npreserving intensity characteristics, resulting in false positives. We propose\nconditioning the denoising process of diffusion models with additional\ninformation derived from a latent representation of the input image. We\ndemonstrate that this conditioning allows for accurate and local adaptation to\nthe general input intensity distribution while avoiding the replication of\nunhealthy structures. We compare the novel approach to different\nstate-of-the-art methods and for different data sets. Our results show\nsubstantial improvements in the segmentation performance, with the Dice score\nimproved by 11.9%, 20.0%, and 44.6%, for the BraTS, ATLAS and MSLUB data sets,\nrespectively, while maintaining competitive performance on the WMH data set.\nFurthermore, our results indicate effective domain adaptation across different\nMRI acquisitions and simulated contrasts, an important attribute for general\nanomaly detection methods. The code for our work is available at\nhttps://github.com/FinnBehrendt/Conditioned-Diffusion-Models-UAD\n","authors":["Finn Behrendt","Debayan Bhattacharya","Robin Mieling","Lennart Maack","Julia Krüger","Roland Opfer","Alexander Schlaefer"],"pdf_url":"https://arxiv.org/pdf/2312.04215v2.pdf","comment":"Preprint: Accepted paper at Combuters in Biology and medicine"},{"id":"http://arxiv.org/abs/2407.21600v2","updated":"2025-01-23T07:53:34Z","published":"2024-07-31T13:34:14Z","title":"Robust Simultaneous Multislice MRI Reconstruction Using Deep Generative\n Priors","summary":" Simultaneous multislice (SMS) imaging is a powerful technique for\naccelerating magnetic resonance imaging (MRI) acquisitions. However, SMS\nreconstruction remains challenging due to complex signal interactions between\nand within the excited slices. In this study, we introduce ROGER, a robust SMS\nMRI reconstruction method based on deep generative priors. Utilizing denoising\ndiffusion probabilistic models (DDPM), ROGER begins with Gaussian noise and\ngradually recovers individual slices through reverse diffusion iterations while\nenforcing data consistency from measured k-space data within the readout\nconcatenation framework. The posterior sampling procedure is designed such that\nthe DDPM training can be performed on single-slice images without requiring\nmodifications for SMS tasks. Additionally, our method incorporates a\nlow-frequency enhancement (LFE) module to address the practical issue that\nSMS-accelerated fast spin echo (FSE) and echo planar imaging (EPI) sequences\ncannot easily embed fully-sampled autocalibration signals. Extensive\nexperiments on both retrospectively and prospectively accelerated datasets\ndemonstrate that ROGER consistently outperforms existing methods, enhancing\nboth anatomical and functional imaging with strong out-of-distribution\ngeneralization. The source code and sample data for ROGER are available at\nhttps://github.com/Solor-pikachu/ROGER.\n","authors":["Shoujin Huang","Guanxiong Luo","Yunlin Zhao","Yilong Liu","Yuwan Wang","Kexin Yang","Jingzhe Liu","Hua Guo","Min Wang","Lingyan Zhang","Mengye Lyu"],"pdf_url":"https://arxiv.org/pdf/2407.21600v2.pdf","comment":"Submitted to Medical Image Analysis. 
New fMRI analysis and figures\n are added since v1"},{"id":"http://arxiv.org/abs/2501.13439v1","updated":"2025-01-23T07:46:48Z","published":"2025-01-23T07:46:48Z","title":"One-cycle Structured Pruning with Stability Driven Structure Search","summary":" Existing structured pruning typically involves multi-stage training\nprocedures that often demand heavy computation. Pruning at initialization,\nwhich aims to address this limitation, reduces training costs but struggles\nwith performance. To address these challenges, we propose an efficient\nframework for one-cycle structured pruning without compromising model\nperformance. In this approach, we integrate pre-training, pruning, and\nfine-tuning into a single training cycle, referred to as the `one cycle\napproach'. The core idea is to search for the optimal sub-network during the\nearly stages of network training, guided by norm-based group saliency criteria\nand structured sparsity regularization. We introduce a novel pruning indicator\nthat determines the stable pruning epoch by assessing the similarity between\nevolving pruning sub-networks across consecutive training epochs. Also, group\nsparsity regularization helps to accelerate the pruning process and results in\nspeeding up the entire process. Extensive experiments on datasets, including\nCIFAR-10/100, and ImageNet, using VGGNet, ResNet, MobileNet, and ViT\narchitectures, demonstrate that our method achieves state-of-the-art accuracy\nwhile being one of the most efficient pruning frameworks in terms of training\ntime. The source code will be made publicly available.\n","authors":["Deepak Ghimire","Dayoung Kil","Seonghwan Jeong","Jaesik Park","Seong-heum Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13439v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.13435v1","updated":"2025-01-23T07:43:56Z","published":"2025-01-23T07:43:56Z","title":"GC-ConsFlow: Leveraging Optical Flow Residuals and Global Context for\n Robust Deepfake Detection","summary":" The rapid development of Deepfake technology has enabled the generation of\nhighly realistic manipulated videos, posing severe social and ethical\nchallenges. Existing Deepfake detection methods primarily focused on either\nspatial or temporal inconsistencies, often neglecting the interplay between the\ntwo or suffering from interference caused by natural facial motions. To address\nthese challenges, we propose the global context consistency flow (GC-ConsFlow),\na novel dual-stream framework that effectively integrates spatial and temporal\nfeatures for robust Deepfake detection. The global grouped context aggregation\nmodule (GGCA), integrated into the global context-aware frame flow stream\n(GCAF), enhances spatial feature extraction by aggregating grouped global\ncontext information, enabling the detection of subtle, spatial artifacts within\nframes. The flow-gradient temporal consistency stream (FGTC), rather than\ndirectly modeling the residuals, it is used to improve the robustness of\ntemporal feature extraction against the inconsistency introduced by unnatural\nfacial motion using optical flow residuals and gradient-based features. By\ncombining these two streams, GC-ConsFlow demonstrates the effectiveness and\nrobustness in capturing complementary spatiotemporal forgery traces. 
Extensive\nexperiments show that GC-ConsFlow outperforms existing state-of-the-art methods\nin detecting Deepfake videos under various compression scenarios.\n","authors":["Jiaxin Chen","Miao Hu","Dengyong Zhang","Jingyang Meng"],"pdf_url":"https://arxiv.org/pdf/2501.13435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04766v3","updated":"2025-01-23T07:43:10Z","published":"2024-09-07T08:53:17Z","title":"Cross-Dataset Gaze Estimation by Evidential Inter-intra Fusion","summary":" Achieving accurate and reliable gaze predictions in complex and diverse\nenvironments remains challenging. Fortunately, it is straightforward to access\ndiverse gaze datasets in real-world applications. We discover that training\nthese datasets jointly can significantly improve the generalization of gaze\nestimation, which is overlooked in previous works. However, due to the inherent\ndistribution shift across different datasets, simply mixing multiple datasets\ndecreases the performance in the original domain despite gaining better\ngeneralization abilities. To address the problem of ``cross-dataset gaze\nestimation'', we propose a novel Evidential Inter-intra Fusion (EIF) framework\nfor training a cross-dataset model that performs well across all source and\nunseen domains. Specifically, we build independent single-dataset branches for\nvarious datasets where the data space is partitioned into overlapping subspaces\nwithin each dataset for local regression, and further create a cross-dataset\nbranch to integrate the generalizable features from single-dataset branches.\nFurthermore, evidential regressors based on the Normal and Inverse-Gamma (NIG)\ndistribution are designed to additionally provide uncertainty estimation apart\nfrom predicting gaze. Building upon this foundation, our proposed framework\nachieves both intra-evidential fusion among multiple local regressors within\neach dataset and inter-evidential fusion among multiple branches by Mixture\n\\textbf{of} Normal Inverse-Gamma (MoNIG) distribution. Experiments demonstrate\nthat our method consistently achieves notable improvements in both source\ndomains and unseen domains.\n","authors":["Shijing Wang","Yaping Huang","Jun Xie","Yi Tian","Feng Chen","Zhepeng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04766v3.pdf","comment":"This paper was previously submitted to ACM MM 2024"},{"id":"http://arxiv.org/abs/2501.13432v1","updated":"2025-01-23T07:35:47Z","published":"2025-01-23T07:35:47Z","title":"Emotion estimation from video footage with LSTM","summary":" Emotion estimation in general is a field that has been studied for a long\ntime, and several approaches exist using machine learning. In this paper, we\npresent an LSTM model that processes the blend-shapes produced by the library\nMediaPipe for a face detected in a live stream of a camera, to estimate the\nmain emotion from the facial expressions. This model is trained on the FER2013\ndataset and delivers a result of 71% accuracy and 62% f1-score, which meets the\naccuracy benchmark of the FER2013 dataset, with significantly reduced\ncomputation costs. 
https://github.com/\nSamir-atra/Emotion_estimation_from_video_footage_with_LSTM_ML_algorithm\n","authors":["Samer Attrah"],"pdf_url":"https://arxiv.org/pdf/2501.13432v1.pdf","comment":"11 pages, 6 figures, 32 references, 4 tables"},{"id":"http://arxiv.org/abs/2406.09827v3","updated":"2025-01-23T07:25:28Z","published":"2024-06-14T08:32:45Z","title":"A Training-free Sub-quadratic Cost Transformer Model Serving Framework\n With Hierarchically Pruned Attention","summary":" In modern large language models (LLMs), increasing the context length is\ncrucial for improving comprehension and coherence in long-context, multi-modal,\nand retrieval-augmented language generation. While many recent transformer\nmodels attempt to extend their context length over a million tokens, they\nremain impractical due to the quadratic time and space complexities. Although\nrecent works on linear and sparse attention mechanisms can achieve this goal,\ntheir real-world applicability is often limited by the need to re-train from\nscratch and significantly worse performance. In response, we propose a novel\napproach, Hierarchically Pruned Attention (HiP), which reduces the time\ncomplexity of the attention mechanism to $O(T \\log T)$ and the space complexity\nto $O(T)$, where $T$ is the sequence length. We notice a pattern in the\nattention scores of pretrained LLMs where tokens close together tend to have\nsimilar scores, which we call ``attention locality''. Based on this\nobservation, we utilize a novel tree-search-like algorithm that estimates the\ntop-$k$ key tokens for a given query on the fly, which is mathematically\nguaranteed to have better performance than random attention pruning. In\naddition to improving the time complexity of the attention mechanism, we\nfurther optimize GPU memory usage by implementing KV cache offloading, which\nstores only $O(\\log T)$ tokens on the GPU while maintaining similar decoding\nthroughput. Experiments on benchmarks show that HiP, with its training-free\nnature, significantly reduces both prefill and decoding latencies, as well as\nmemory usage, while maintaining high-quality generation with minimal\ndegradation. HiP enables pretrained LLMs to scale up to millions of tokens on\ncommodity GPUs, potentially unlocking long-context LLM applications previously\ndeemed infeasible.\n","authors":["Heejun Lee","Geon Park","Youngwan Lee","Jaduk Suh","Jina Kim","Wonyoung Jeong","Bumsik Kim","Hyemin Lee","Myeongjae Jeon","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2406.09827v3.pdf","comment":"44 pages"},{"id":"http://arxiv.org/abs/2403.12481v2","updated":"2025-01-23T07:18:03Z","published":"2024-03-19T06:36:42Z","title":"TT-BLIP: Enhancing Fake News Detection Using BLIP and Tri-Transformer","summary":" Detecting fake news has received a lot of attention. Many previous methods\nconcatenate independently encoded unimodal data, ignoring the benefits of\nintegrated multimodal information. Also, the absence of specialized feature\nextraction for text and images further limits these methods. This paper\nintroduces an end-to-end model called TT-BLIP that applies the bootstrapping\nlanguage-image pretraining for unified vision-language understanding and\ngeneration (BLIP) for three types of information: BERT and BLIPTxt for text,\nResNet and BLIPImg for images, and bidirectional BLIP encoders for multimodal\ninformation. 
The Multimodal Tri-Transformer fuses tri-modal features using\nthree types of multi-head attention mechanisms, ensuring integrated modalities\nfor enhanced representations and improved multimodal data analysis. The\nexperiments are performed using two fake news datasets, Weibo and Gossipcop.\nThe results indicate TT-BLIP outperforms the state-of-the-art models.\n","authors":["Eunjee Choi","Jong-Kook Kim"],"pdf_url":"https://arxiv.org/pdf/2403.12481v2.pdf","comment":"8 pages, Accepted 27th International Conference on Information\n Fusion, FUSION 2024"},{"id":"http://arxiv.org/abs/2501.13426v1","updated":"2025-01-23T07:08:48Z","published":"2025-01-23T07:08:48Z","title":"Auto-Prompting SAM for Weakly Supervised Landslide Extraction","summary":" Weakly supervised landslide extraction aims to identify landslide regions\nfrom remote sensing data using models trained with weak labels, particularly\nimage-level labels. However, it is often challenged by the imprecise boundaries\nof the extracted objects due to the lack of pixel-wise supervision and the\nproperties of landslide objects. To tackle these issues, we propose a simple\nyet effective method by auto-prompting the Segment Anything Model (SAM), i.e.,\nAPSAM. Instead of depending on high-quality class activation maps (CAMs) for\npseudo-labeling or fine-tuning SAM, our method directly yields fine-grained\nsegmentation masks from SAM inference through prompt engineering. Specifically,\nit adaptively generates hybrid prompts from the CAMs obtained by an object\nlocalization network. To provide sufficient information for SAM prompting, an\nadaptive prompt generation (APG) algorithm is designed to fully leverage the\nvisual patterns of CAMs, enabling the efficient generation of pseudo-masks for\nlandslide extraction. These informative prompts are able to identify the extent\nof landslide areas (box prompts) and denote the centers of landslide objects\n(point prompts), guiding SAM in landslide segmentation. Experimental results on\nhigh-resolution aerial and satellite datasets demonstrate the effectiveness of\nour method, achieving improvements of at least 3.0\\% in F1 score and 3.69\\% in\nIoU compared to other state-of-the-art methods. The source codes and datasets\nwill be available at https://github.com/zxk688.\n","authors":["Jian Wang","Xiaokang Zhang","Xianping Ma","Weikang Yu","Pedram Ghamisi"],"pdf_url":"https://arxiv.org/pdf/2501.13426v1.pdf","comment":"5 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.13422v1","updated":"2025-01-23T06:53:35Z","published":"2025-01-23T06:53:35Z","title":"Atmospheric Noise-Resilient Image Classification in a Real-World\n Scenario: Using Hybrid CNN and Pin-GTSVM","summary":" Parking space occupation detection using deep learning frameworks has seen\nsignificant advancements over the past few years. While these approaches\neffectively detect partial obstructions and adapt to varying lighting\nconditions, their performance significantly diminishes when haze is present.\nThis paper proposes a novel hybrid model with a pre-trained feature extractor\nand a Pinball Generalized Twin Support Vector Machine (Pin-GTSVM) classifier,\nwhich removes the need for a dehazing system from the current State-of-The-Art\nhazy parking slot classification systems and is also insensitive to any\natmospheric noise. The proposed system can seamlessly integrate with\nconventional smart parking infrastructures, leveraging a minimal number of\ncameras to monitor and manage hundreds of parking spaces efficiently. 
Its\neffectiveness has been evaluated against established parking space detection\nmethods using the CNRPark Patches, PKLot, and a custom dataset specific to hazy\nparking scenarios. Furthermore, empirical results indicate a significant\nimprovement in accuracy on a hazy parking system, thus emphasizing efficient\natmospheric noise handling.\n","authors":["Shlok Mehendale","Jajati Keshari Sahoo","Rajendra Kumar Roul"],"pdf_url":"https://arxiv.org/pdf/2501.13422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02483v3","updated":"2025-01-23T06:52:04Z","published":"2024-09-04T07:20:01Z","title":"TASAR: Transfer-based Attack on Skeletal Action Recognition","summary":" Skeletal sequences, as well-structured representations of human behaviors,\nplay a vital role in Human Activity Recognition (HAR). The transferability of\nadversarial skeletal sequences enables attacks in real-world HAR scenarios,\nsuch as autonomous driving, intelligent surveillance, and human-computer\ninteractions. However, most existing skeleton-based HAR (S-HAR) attacks are\nprimarily designed for white-box scenarios and exhibit weak adversarial\ntransferability. Therefore, they cannot be considered true transfer-based S-HAR\nattacks. More importantly, the reason for this failure remains unclear. In this\npaper, we study this phenomenon through the lens of loss surface, and find that\nits sharpness contributes to the weak transferability in S-HAR. Inspired by\nthis observation, we assume and empirically validate that smoothening the\nrugged loss landscape could potentially improve adversarial transferability in\nS-HAR. To this end, we propose the first \\textbf{T}ransfer-based\n\\textbf{A}ttack on \\textbf{S}keletal \\textbf{A}ction \\textbf{R}ecognition,\nTASAR. TASAR explores the smoothed model posterior without requiring surrogate\nre-training, which is achieved by a new post-train Dual Bayesian optimization\nstrategy. Furthermore, unlike previous transfer-based attacks that treat each\nframe independently and overlook temporal coherence within sequences, TASAR\nincorporates motion dynamics into the Bayesian attack gradient, effectively\ndisrupting the spatial-temporal coherence of S-HARs. To exhaustively evaluate\nthe effectiveness of existing methods and our method, we build the first\nlarge-scale robust S-HAR benchmark, comprising 7 S-HAR models, 10 attack\nmethods, 3 S-HAR datasets and 2 defense methods. Extensive results demonstrate\nthe superiority of TASAR. Our benchmark enables easy comparisons for future\nstudies, with the code available in the supplementary material.\n","authors":["Yunfeng Diao","Baiqi Wu","Ruixuan Zhang","Ajian Liu","Xingxing Wei","Meng Wang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02483v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.08572"},{"id":"http://arxiv.org/abs/2501.13420v1","updated":"2025-01-23T06:48:48Z","published":"2025-01-23T06:48:48Z","title":"LVFace: Large Vision model for Face Recogniton","summary":" Recently, large vision models have demonstrated powerful representation\ncapabilities in the field of computer vision. However, we unexpectedly found\nthat face recognition research is still mainly focused on CNN-based model\narchitectures, which may lead to suboptimal state-of-the-art (SOTA) performance\nin face recognition. Therefore, we study how to use various loss functions from\nhistorical research orthogonally to train a new state-of-the-art face\nrecognition model based on large vision models, called LVFace. 
On the largest\npublic face database, WebFace42M, we demonstrated the superiority of LVFace\nover other advanced face recognition methods and achieved first place in the\nICCV21 MFR-Ongoing challenge, until the submission of this work (December 30,\n2024, academic track).\n","authors":["Jinghan You","Yuanrui Sun","Mingyu Guo","Chao Feng","Jiao Ran"],"pdf_url":"https://arxiv.org/pdf/2501.13420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13418v1","updated":"2025-01-23T06:45:17Z","published":"2025-01-23T06:45:17Z","title":"Rethinking the Sample Relations for Few-Shot Classification","summary":" Feature quality is paramount for classification performance, particularly in\nfew-shot scenarios. Contrastive learning, a widely adopted technique for\nenhancing feature quality, leverages sample relations to extract intrinsic\nfeatures that capture semantic information and has achieved remarkable success\nin Few-Shot Learning (FSL). Nevertheless, current few-shot contrastive learning\napproaches often overlook the semantic similarity discrepancies at different\ngranularities when employing the same modeling approach for different sample\nrelations, which limits the potential of few-shot contrastive learning. In this\npaper, we introduce a straightforward yet effective contrastive learning\napproach, Multi-Grained Relation Contrastive Learning (MGRCL), as a\npre-training feature learning model to boost few-shot learning by meticulously\nmodeling sample relations at different granularities. MGRCL categorizes sample\nrelations into three types: intra-sample relation of the same sample under\ndifferent transformations, intra-class relation of homogenous samples, and\ninter-class relation of inhomogeneous samples. In MGRCL, we design\nTransformation Consistency Learning (TCL) to ensure the rigorous semantic\nconsistency of a sample under different transformations by aligning predictions\nof input pairs. Furthermore, to preserve discriminative information, we employ\nClass Contrastive Learning (CCL) to ensure that a sample is always closer to\nits homogenous samples than its inhomogeneous ones, as homogenous samples share\nsimilar semantic content while inhomogeneous samples have different semantic\ncontent. Our method is assessed across four popular FSL benchmarks, showing\nthat such a simple pre-training feature learning method surpasses a majority of\nleading FSL methods. Moreover, our method can be incorporated into other FSL\nmethods as the pre-trained model and help them obtain significant performance\ngains.\n","authors":["Guowei Yin","Sheng Huang","Luwen Huangfu","Yi Zhang","Xiaohong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.13418v1.pdf","comment":"32 pages"},{"id":"http://arxiv.org/abs/2501.13417v1","updated":"2025-01-23T06:43:38Z","published":"2025-01-23T06:43:38Z","title":"GeomGS: LiDAR-Guided Geometry-Aware Gaussian Splatting for Robot\n Localization","summary":" Mapping and localization are crucial problems in robotics and autonomous\ndriving. Recent advances in 3D Gaussian Splatting (3DGS) have enabled precise\n3D mapping and scene understanding by rendering photo-realistic images.\nHowever, existing 3DGS methods often struggle to accurately reconstruct a 3D\nmap that reflects the actual scale and geometry of the real world, which\ndegrades localization performance. To address these limitations, we propose a\nnovel 3DGS method called Geometry-Aware Gaussian Splatting (GeomGS). 
This\nmethod fully integrates LiDAR data into 3D Gaussian primitives via a\nprobabilistic approach, as opposed to approaches that only use LiDAR as initial\npoints or introduce simple constraints for Gaussian points. To this end, we\nintroduce a Geometric Confidence Score (GCS), which identifies the structural\nreliability of each Gaussian point. The GCS is optimized simultaneously with\nGaussians under probabilistic distance constraints to construct a precise\nstructure. Furthermore, we propose a novel localization method that fully\nutilizes both the geometric and photometric properties of GeomGS. Our GeomGS\ndemonstrates state-of-the-art geometric and localization performance across\nseveral benchmarks, while also improving photometric performance.\n","authors":["Jaewon Lee","Mangyu Kong","Minseong Park","Euntai Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13417v1.pdf","comment":"Preprint, Under review"},{"id":"http://arxiv.org/abs/2408.01167v4","updated":"2025-01-23T06:30:53Z","published":"2024-08-02T10:34:23Z","title":"Rethinking Pre-Trained Feature Extractor Selection in Multiple Instance\n Learning for Whole Slide Image Classification","summary":" Multiple instance learning (MIL) has become a preferred method for gigapixel\nwhole slide image (WSI) classification without requiring patch-level\nannotations. Current MIL research primarily relies on embedding-based\napproaches, which extract patch features using a pre-trained feature extractor\nand aggregate them for slide-level prediction. Despite the critical role of\nfeature extraction, there is limited guidance on selecting optimal feature\nextractors to maximize WSI performance. This study addresses this gap by\nsystematically evaluating MIL feature extractors across three dimensions:\npre-training dataset, backbone model, and pre-training method. Extensive\nexperiments were conducted on two public WSI datasets (TCGA-NSCLC and\nCamelyon16) using four state-of-the-art (SOTA) MIL models. Our findings reveal\nthat: 1) selecting a robust self-supervised learning (SSL) method has a greater\nimpact on performance than relying solely on an in-domain pre-training dataset;\n2) prioritizing Transformer-based backbones with deeper architectures over\nCNN-based models; and 3) using larger, more diverse pre-training datasets\nsignificantly enhances classification outcomes. We hope that these insights can\nprovide practical guidance for optimizing WSI classification and explain the\nreasons behind the performance advantages of the current SOTA pathology\nfoundation models. Furthermore, this work may inform the development of more\neffective pathology foundation models. Our code is publicly available at\nhttps://github.com/bryanwong17/MIL-Feature-Extractor-Selection\n","authors":["Bryan Wong","Mun Yong Yi"],"pdf_url":"https://arxiv.org/pdf/2408.01167v4.pdf","comment":"Accepted to IEEE International Symposium on Biomedical Imaging (ISBI)\n 2025"},{"id":"http://arxiv.org/abs/2411.01988v4","updated":"2025-01-23T06:19:34Z","published":"2024-11-04T11:20:17Z","title":"QCS: Feature Refining from Quadruplet Cross Similarity for Facial\n Expression Recognition","summary":" Facial expression recognition faces challenges where labeled significant\nfeatures in datasets are mixed with unlabeled redundant ones. 
In this paper, we\nintroduce Cross Similarity Attention (CSA) to mine richer intrinsic information\nfrom image pairs, overcoming a limitation when the Scaled Dot-Product Attention\nof ViT is directly applied to calculate the similarity between two different\nimages. Based on CSA, we simultaneously minimize intra-class differences and\nmaximize inter-class differences at the fine-grained feature level through\ninteractions among multiple branches. Contrastive residual distillation is\nutilized to transfer the information learned in the cross module back to the\nbase network. We ingeniously design a four-branch centrally symmetric network,\nnamed Quadruplet Cross Similarity (QCS), which alleviates gradient conflicts\narising from the cross module and achieves balanced and stable training. It can\nadaptively extract discriminative features while isolating redundant ones. The\ncross-attention modules exist during training, and only one base branch is\nretained during inference, resulting in no increase in inference time.\nExtensive experiments show that our proposed method achieves state-of-the-art\nperformance on several FER datasets.\n","authors":["Chengpeng Wang","Li Chen","Lili Wang","Zhaofan Li","Xuebin Lv"],"pdf_url":"https://arxiv.org/pdf/2411.01988v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13402v1","updated":"2025-01-23T06:01:03Z","published":"2025-01-23T06:01:03Z","title":"VIGS SLAM: IMU-based Large-Scale 3D Gaussian Splatting SLAM","summary":" Recently, map representations based on radiance fields such as 3D Gaussian\nSplatting and NeRF, which excel at realistic depiction, have attracted\nconsiderable attention, leading to attempts to combine them with SLAM. While\nthese approaches can build highly realistic maps, large-scale SLAM still\nremains a challenge because they require a large number of Gaussian images for\nmapping and adjacent images as keyframes for tracking. We propose a novel 3D\nGaussian Splatting SLAM method, VIGS SLAM, that utilizes sensor fusion of RGB-D\nand IMU sensors for large-scale indoor environments. To reduce the\ncomputational load of 3DGS-based tracking, we adopt an ICP-based tracking\nframework that combines IMU preintegration to provide a good initial guess for\naccurate pose estimation. Our proposed method is the first to propose that\nGaussian Splatting-based SLAM can be effectively performed in large-scale\nenvironments by integrating IMU sensor measurements. This proposal not only\nenhances the performance of Gaussian Splatting SLAM beyond room-scale scenarios\nbut also achieves SLAM performance comparable to state-of-the-art methods in\nlarge-scale indoor environments.\n","authors":["Gyuhyeon Pak","Euntai Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13402v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.13400v1","updated":"2025-01-23T05:57:13Z","published":"2025-01-23T05:57:13Z","title":"YOLOv8 to YOLO11: A Comprehensive Architecture In-depth Comparative\n Review","summary":" In the field of deep learning-based computer vision, YOLO is revolutionary.\nWith respect to deep learning models, YOLO is also the one that is evolving the\nmost rapidly. Unfortunately, not every YOLO model possesses scholarly\npublications. Moreover, there exists a YOLO model that lacks a publicly\naccessible official architectural diagram. Naturally, this engenders\nchallenges, such as complicating the understanding of how the model operates in\npractice. 
Furthermore, the review articles that are presently available do not\ndelve into the specifics of each model. The objective of this study is to\npresent a comprehensive and in-depth architecture comparison of the four most\nrecent YOLO models, specifically YOLOv8 through YOLO11, thereby enabling\nreaders to quickly grasp not only how each model functions, but also the\ndistinctions between them. To analyze each YOLO version's architecture, we\nmeticulously examined the relevant academic papers, documentation, and\nscrutinized the source code. The analysis reveals that while each version of\nYOLO has improvements in architecture and feature extraction, certain blocks\nremain unchanged. The lack of scholarly publications and official diagrams\npresents challenges for understanding the model's functionality and future\nenhancement. Future developers are encouraged to provide these resources.\n","authors":["Priyanto Hidayatullah","Nurjannah Syakrani","Muhammad Rizqi Sholahuddin","Trisna Gelar","Refdinal Tubagus"],"pdf_url":"https://arxiv.org/pdf/2501.13400v1.pdf","comment":"submitted to Journal of Applied Engineering and Technological Science"},{"id":"http://arxiv.org/abs/2405.14318v3","updated":"2025-01-23T05:54:28Z","published":"2024-05-23T08:43:09Z","title":"Adaptive Retention & Correction for Continual Learning","summary":" Continual learning, also known as lifelong learning or incremental learning,\nrefers to the process by which a model learns from a stream of incoming data\nover time. A common problem in continual learning is the classification layer's\nbias towards the most recent task. Traditionally, methods have relied on\nincorporating data from past tasks during training to mitigate this issue.\nHowever, the recent shift in continual learning to memory-free environments has\nrendered these approaches infeasible. In this study, we propose a solution\nfocused on the testing phase. We first introduce a simple Out-of-Task Detection\nmethod, OTD, designed to accurately identify samples from past tasks during\ntesting. Leveraging OTD, we then propose: (1) an Adaptive Retention mechanism\nfor dynamically tuning the classifier layer on past task data; (2) an Adaptive\nCorrection mechanism for revising predictions when the model classifies data\nfrom previous tasks into classes from the current task. We name our approach\nAdaptive Retention & Correction (ARC). While designed for memory-free\nenvironments, ARC also proves effective in memory-based settings. Extensive\nexperiments show that our proposed method can be plugged in to virtually any\nexisting continual learning approach without requiring any modifications to its\ntraining procedure. Specifically, when integrated with state-of-the-art\napproaches, ARC achieves an average performance increase of 2.7% and 2.6% on\nthe CIFAR-100 and Imagenet-R datasets, respectively.\n","authors":["Haoran Chen","Micah Goldblum","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.14318v3.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2411.17354v2","updated":"2025-01-23T05:52:26Z","published":"2024-11-26T11:57:20Z","title":"DWCL: Dual-Weighted Contrastive Learning for Multi-View Clustering","summary":" Multi-view contrastive clustering (MVCC) has gained significant attention for\ngenerating consistent clustering structures from multiple views through\ncontrastive learning. 
However, most existing MVCC methods create cross-views by\ncombining any two views, leading to a high volume of unreliable pairs.\nFurthermore, these approaches often overlook discrepancies in multi-view\nrepresentations, resulting in representation degeneration. To address these\nchallenges, we introduce a novel model called Dual-Weighted Contrastive\nLearning (DWCL) for Multi-View Clustering. Specifically, to reduce the impact\nof unreliable cross-views, we introduce an innovative Best-Other (B-O)\ncontrastive mechanism that enhances the representation of individual views at a\nlow computational cost. Furthermore, we develop a dual weighting strategy that\ncombines a view quality weight, reflecting the quality of each view, with a\nview discrepancy weight. This approach effectively mitigates representation\ndegeneration by downplaying cross-views that are both low in quality and high\nin discrepancy. We theoretically validate the efficiency of the B-O contrastive\nmechanism and the effectiveness of the dual weighting strategy. Extensive\nexperiments demonstrate that DWCL outperforms previous methods across eight\nmulti-view datasets, showcasing superior performance and robustness in MVCC.\nSpecifically, our method achieves absolute accuracy improvements of 5.4\\% and\n5.6\\% compared to state-of-the-art methods on the Caltech6V7 and MSRCv1\ndatasets, respectively.\n","authors":["Hanning Yuan","Zhihui Zhang","Qi Guo","Lianhua Chi","Sijie Ruan","Jinhui Pang","Xiaoshuai Hao"],"pdf_url":"https://arxiv.org/pdf/2411.17354v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04561v3","updated":"2025-01-23T05:51:07Z","published":"2025-01-08T15:18:09Z","title":"OpenOmni: Large Language Models Pivot Zero-shot Omnimodal Alignment\n across Language with Real-time Self-Aware Emotional Speech Synthesis","summary":" Recent advancements in omnimodal learning have been achieved in understanding\nand generation across images, text, and speech, though mainly within\nproprietary models. Limited omnimodal datasets and the inherent challenges\nassociated with real-time emotional speech generation have hindered open-source\nprogress. To address these issues, we propose openomni, a two-stage training\nmethod combining omnimodal alignment and speech generation to develop a\nstate-of-the-art omnimodal large language model. In the alignment phase, a\npre-trained speech model is further trained on text-image tasks to generalize\nfrom vision to speech in a (near) zero-shot manner, outperforming models\ntrained on tri-modal datasets. In the speech generation phase, a lightweight\ndecoder facilitates real-time emotional speech through training on speech tasks\nand preference learning. 
Experiments demonstrate that openomni consistently\nimproves across omnimodal, vision-language, and speech-language evaluations,\nenabling natural, emotion-rich dialogues and real-time emotional speech\ngeneration.\n","authors":["Run Luo","Ting-En Lin","Haonan Zhang","Yuchuan Wu","Xiong Liu","Min Yang","Yongbin Li","Longze Chen","Jiaming Li","Lei Zhang","Yangyi Chen","Hamid Alinejad-Rokny","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2501.04561v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13396v1","updated":"2025-01-23T05:46:08Z","published":"2025-01-23T05:46:08Z","title":"Towards Intelligent Design: A Self-driven Framework for Collocated\n Clothing Synthesis Leveraging Fashion Styles and Textures","summary":" Collocated clothing synthesis (CCS) has emerged as a pivotal topic in fashion\ntechnology, primarily concerned with the generation of a clothing item that\nharmoniously matches a given item. However, previous investigations have relied\non using paired outfits, such as a pair of matching upper and lower clothing,\nto train a generative model for achieving this task. This reliance on the\nexpertise of fashion professionals in the construction of such paired outfits\nhas engendered a laborious and time-intensive process. In this paper, we\nintroduce a new self-driven framework, named style- and texture-guided\ngenerative network (ST-Net), to synthesize collocated clothing without the\nnecessity for paired outfits, leveraging self-supervised learning. ST-Net is\ndesigned to extrapolate fashion compatibility rules from the style and texture\nattributes of clothing, using a generative adversarial network. To facilitate\nthe training and evaluation of our model, we have constructed a large-scale\ndataset specifically tailored for unsupervised CCS. Extensive experiments\nsubstantiate that our proposed method outperforms the state-of-the-art\nbaselines in terms of both visual authenticity and fashion compatibility.\n","authors":["Minglong Dong","Dongliang Zhou","Jianghong Ma","Haijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.13396v1.pdf","comment":"This paper has been accepted for presentation at ICASSP 2024"},{"id":"http://arxiv.org/abs/2501.13389v1","updated":"2025-01-23T05:19:00Z","published":"2025-01-23T05:19:00Z","title":"AEON: Adaptive Estimation of Instance-Dependent In-Distribution and\n Out-of-Distribution Label Noise for Robust Learning","summary":" Robust training with noisy labels is a critical challenge in image\nclassification, offering the potential to reduce reliance on costly clean-label\ndatasets. Real-world datasets often contain a mix of in-distribution (ID) and\nout-of-distribution (OOD) instance-dependent label noise, a challenge that is\nrarely addressed simultaneously by existing methods and is further compounded\nby the lack of comprehensive benchmarking datasets. Furthermore, even though\ncurrent noisy-label learning approaches attempt to find noisy-label samples\nduring training, these methods do not aim to estimate ID and OOD noise rates to\npromote their effectiveness in the selection of such noisy-label samples, and\nthey are often represented by inefficient multi-stage learning algorithms. We\npropose the Adaptive Estimation of Instance-Dependent In-Distribution and\nOut-of-Distribution Label Noise (AEON) approach to address these research gaps.\nAEON is an efficient one-stage noisy-label learning methodology that\ndynamically estimates instance-dependent ID and OOD label noise rates to\nenhance robustness to complex noise settings. 
Additionally, we introduce a new\nbenchmark reflecting real-world ID and OOD noise scenarios. Experiments\ndemonstrate that AEON achieves state-of-the-art performance on both synthetic\nand real-world datasets\n","authors":["Arpit Garg","Cuong Nguyen","Rafael Felix","Yuyuan Liu","Thanh-Toan Do","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2501.13389v1.pdf","comment":"In Submission"},{"id":"http://arxiv.org/abs/2501.13387v1","updated":"2025-01-23T05:15:10Z","published":"2025-01-23T05:15:10Z","title":"From Images to Point Clouds: An Efficient Solution for Cross-media Blind\n Quality Assessment without Annotated Training","summary":" We present a novel quality assessment method which can predict the perceptual\nquality of point clouds from new scenes without available annotations by\nleveraging the rich prior knowledge in images, called the Distribution-Weighted\nImage-Transferred Point Cloud Quality Assessment (DWIT-PCQA). Recognizing the\nhuman visual system (HVS) as the decision-maker in quality assessment\nregardless of media types, we can emulate the evaluation criteria for human\nperception via neural networks and further transfer the capability of quality\nprediction from images to point clouds by leveraging the prior knowledge in the\nimages. Specifically, domain adaptation (DA) can be leveraged to bridge the\nimages and point clouds by aligning feature distributions of the two media in\nthe same feature space. However, the different manifestations of distortions in\nimages and point clouds make feature alignment a difficult task. To reduce the\nalignment difficulty and consider the different distortion distribution during\nalignment, we have derived formulas to decompose the optimization objective of\nthe conventional DA into two suboptimization functions with distortion as a\ntransition. Specifically, through network implementation, we propose the\ndistortion-guided biased feature alignment which integrates existing/estimated\ndistortion distribution into the adversarial DA framework, emphasizing common\ndistortion patterns during feature alignment. Besides, we propose the\nquality-aware feature disentanglement to mitigate the destruction of the\nmapping from features to quality during alignment with biased distortions.\nExperimental results demonstrate that our proposed method exhibits reliable\nperformance compared to general blind PCQA methods without needing point cloud\nannotations.\n","authors":["Yipeng Liu","Qi Yang","Yujie Zhang","Yiling Xu","Le Yang","Zhu Li"],"pdf_url":"https://arxiv.org/pdf/2501.13387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06165v6","updated":"2025-01-23T04:58:05Z","published":"2024-02-09T03:48:20Z","title":"Learning Contrastive Feature Representations for Facial Action Unit\n Detection","summary":" For the Facial Action Unit (AU) detection task, accurately capturing the\nsubtle facial differences between distinct AUs is essential for reliable\ndetection. Additionally, AU detection faces challenges from class imbalance and\nthe presence of noisy or false labels, which undermine detection accuracy. In\nthis paper, we introduce a novel contrastive learning framework aimed for AU\ndetection that incorporates both self-supervised and supervised signals,\nthereby enhancing the learning of discriminative features for accurate AU\ndetection. To tackle the class imbalance issue, we employ a negative sample\nre-weighting strategy that adjusts the step size of updating parameters for\nminority and majority class samples. 
Moreover, to address the challenges posed\nby noisy and false AU labels, we employ a sampling technique that encompasses\nthree distinct types of positive sample pairs. This enables us to inject\nself-supervised signals into the supervised signal, effectively mitigating the\nadverse effects of noisy labels. Our experimental assessments, conducted on\nfive widely-utilized benchmark datasets (BP4D, DISFA, BP4D+, GFT and\nAff-Wild2), underscore the superior performance of our approach compared to\nstate-of-the-art methods of AU detection.\n","authors":["Ziqiao Shang","Bin Liu","Fengmao Lv","Fei Teng","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2402.06165v6.pdf","comment":"35 pages, 20 figures, submitted to Pattern Recognition (PR)"},{"id":"http://arxiv.org/abs/2411.17807v2","updated":"2025-01-23T04:46:30Z","published":"2024-11-26T19:00:01Z","title":"A solvable generative model with a linear, one-step denoiser","summary":" We develop an analytically tractable single-step diffusion model based on a\nlinear denoiser and present explicit formula for the Kullback-Leibler\ndivergence between generated and sampling distribution, taken to be isotropic\nGaussian, showing the effect of finite diffusion time and noise scale. Our\nstudy further reveals that the monotonic fall phase of Kullback-Leibler\ndivergence begins when the training dataset size reaches the dimension of the\ndata points. Along the way, we provide a mathematically precise definition of\nmemorization to non-memorization transition when only finite number of data\npoints are available. It is shown that the simplified model also features this\ntransition during the monotonic fall phase of the aforementioned\nKullback-Leibler divergence. For large-scale practical diffusion models, we\nexplain why higher number of diffusion steps enhance production quality based\non the theoretical arguments presented before. In addition, we show that higher\ndiffusion steps does not necessarily help in reducing memorization. These two\nfacts combined suggests existence of an optimal number of diffusion steps for\nfinite number of training samples.\n","authors":["Indranil Halder"],"pdf_url":"https://arxiv.org/pdf/2411.17807v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2310.05026v2","updated":"2025-01-23T04:45:10Z","published":"2023-10-08T06:10:09Z","title":"Low-Resolution Self-Attention for Semantic Segmentation","summary":" Semantic segmentation tasks naturally require high-resolution information for\npixel-wise segmentation and global context information for class prediction.\nWhile existing vision transformers demonstrate promising performance, they\noften utilize high-resolution context modeling, resulting in a computational\nbottleneck. In this work, we challenge conventional wisdom and introduce the\nLow-Resolution Self-Attention (LRSA) mechanism to capture global context at a\nsignificantly reduced computational cost, i.e., FLOPs. Our approach involves\ncomputing self-attention in a fixed low-resolution space regardless of the\ninput image's resolution, with additional 3x3 depth-wise convolutions to\ncapture fine details in the high-resolution space. We demonstrate the\neffectiveness of our LRSA approach by building the LRFormer, a vision\ntransformer with an encoder-decoder structure. Extensive experiments on the\nADE20K, COCO-Stuff, and Cityscapes datasets demonstrate that LRFormer\noutperforms state-of-the-art models. 
The code is available at\nhttps://github.com/yuhuan-wu/LRFormer.\n","authors":["Yu-Huan Wu","Shi-Chen Zhang","Yun Liu","Le Zhang","Xin Zhan","Daquan Zhou","Jiashi Feng","Ming-Ming Cheng","Liangli Zhen"],"pdf_url":"https://arxiv.org/pdf/2310.05026v2.pdf","comment":"added many experiments. 13 pages, 12 tables, 6 figures"},{"id":"http://arxiv.org/abs/2501.13376v1","updated":"2025-01-23T04:41:20Z","published":"2025-01-23T04:41:20Z","title":"Scalable Evaluation Framework for Foundation Models in Musculoskeletal\n MRI Bridging Computational Innovation with Clinical Utility","summary":" Foundation models hold transformative potential for medical imaging, but\ntheir clinical utility requires rigorous evaluation to address their strengths\nand limitations. This study introduces an evaluation framework for assessing\nthe clinical impact and translatability of SAM, MedSAM, and SAM2, using\nmusculoskeletal MRI as a case study. We tested these models across zero-shot\nand finetuned paradigms to assess their ability to process diverse anatomical\nstructures and effectuate clinically reliable biomarkers, including cartilage\nthickness, muscle volume, and disc height. We engineered a modular pipeline\nemphasizing scalability, clinical relevance, and workflow integration, reducing\nmanual effort and aligning validation with end-user expectations. Hierarchical\nmodeling revealed how dataset mixing, anatomical complexity, and MRI\nacquisition parameters influence performance, providing insights into the role\nof imaging refinements in improving segmentation accuracy. This work\ndemonstrates how clinically focused evaluations can connect computational\nadvancements with tangible applications, creating a pathway for foundation\nmodels to address medical challenges. By emphasizing interdisciplinary\ncollaboration and aligning technical innovation with clinical priorities, our\nframework provides a roadmap for advancing machine learning technologies into\nscalable and impactful biomedical solutions.\n","authors":["Gabrielle Hoyer","Michelle W Tong","Rupsa Bhattacharjee","Valentina Pedoia","Sharmila Majumdar"],"pdf_url":"https://arxiv.org/pdf/2501.13376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13370v1","updated":"2025-01-23T04:17:20Z","published":"2025-01-23T04:17:20Z","title":"Unraveling Normal Anatomy via Fluid-Driven Anomaly Randomization","summary":" Data-driven machine learning has made significant strides in medical image\nanalysis. However, most existing methods are tailored to specific modalities\nand assume a particular resolution (often isotropic). This limits their\ngeneralizability in clinical settings, where variations in scan appearance\narise from differences in sequence parameters, resolution, and orientation.\nFurthermore, most general-purpose models are designed for healthy subjects and\nsuffer from performance degradation when pathology is present. We introduce UNA\n(Unraveling Normal Anatomy), the first modality-agnostic learning approach for\nnormal brain anatomy reconstruction that can handle both healthy scans and\ncases with pathology. We propose a fluid-driven anomaly randomization method\nthat generates an unlimited number of realistic pathology profiles on-the-fly.\nUNA is trained on a combination of synthetic and real data, and can be applied\ndirectly to real images with potential pathology without the need for\nfine-tuning. 
We demonstrate UNA's effectiveness in reconstructing healthy brain\nanatomy and showcase its direct application to anomaly detection, using both\nsimulated and real images from 3D healthy and stroke datasets, including CT and\nMRI scans. By bridging the gap between healthy and diseased images, UNA enables\nthe use of general-purpose models on diseased images, opening up new\nopportunities for large-scale analysis of uncurated clinical images in the\npresence of pathology. Code is available at https://github.com/peirong26/UNA.\n","authors":["Peirong Liu","Ana Lawry Aguila","Juan E. Iglesias"],"pdf_url":"https://arxiv.org/pdf/2501.13370v1.pdf","comment":"16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.13368v1","updated":"2025-01-23T04:14:59Z","published":"2025-01-23T04:14:59Z","title":"Meta-Feature Adapter: Integrating Environmental Metadata for Enhanced\n Animal Re-identification","summary":" Identifying individual animals within large wildlife populations is essential\nfor effective wildlife monitoring and conservation efforts. Recent advancements\nin computer vision have shown promise in animal re-identification (Animal ReID)\nby leveraging data from camera traps. However, existing methods rely\nexclusively on visual data, neglecting environmental metadata that ecologists\nhave identified as highly correlated with animal behavior and identity, such as\ntemperature and circadian rhythms. To bridge this gap, we propose the\nMeta-Feature Adapter (MFA), a lightweight module designed to integrate\nenvironmental metadata into vision-language foundation models, such as CLIP, to\nenhance Animal ReID performance. Our approach translates environmental metadata\ninto natural language descriptions, encodes them into metadata-aware text\nembeddings, and incorporates these embeddings into image features through a\ncross-attention mechanism. Furthermore, we introduce a Gated Cross-Attention\nmechanism that dynamically adjusts the weights of metadata contributions,\nfurther improving performance. To validate our approach, we constructed the\nMetadata Augmented Animal Re-identification (MAAR) dataset, encompassing six\nspecies from New Zealand and featuring paired image data and environmental\nmetadata. Extensive experiments demonstrate that MFA consistently improves\nAnimal ReID performance across multiple baseline models.\n","authors":["Yuzhuo Li","Di Zhao","Yihao Wu","Yun Sing Koh"],"pdf_url":"https://arxiv.org/pdf/2501.13368v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.13365v1","updated":"2025-01-23T04:10:31Z","published":"2025-01-23T04:10:31Z","title":"Enhanced Extractor-Selector Framework and Symmetrization Weighted Binary\n Cross-Entropy for Edge Detections","summary":" Recent advancements have demonstrated the effectiveness of the\nextractor-selector (E-S) framework in edge detection (ED) tasks, which achieves\nstate-of-the-art (SOTA) performance in both quantitative metrics and perceptual\nquality. However, this method still falls short of fully exploiting the\npotential of feature extractors, as selectors only operate on highly compressed\nfeature maps that lack diversity and suffer from substantial information loss.\nAdditionally, while union training can improve perceptual quality, the highest\nevaluation scores are typically obtained without it, creating a trade-off\nbetween quantitative accuracy and perceptual fidelity. 
To address these\nlimitations, we propose an enhanced E-S architecture, which utilizes richer,\nless-loss feature representations and incorporates auxiliary features during\nthe selection process, thereby improving the effectiveness of the feature\nselection mechanism. Additionally, we introduce a novel loss function, the\nSymmetrization Weight Binary Cross-Entropy (SWBCE), which simultaneously\nemphasizes both the recall of edge pixels and the suppression of erroneous edge\npredictions, thereby enhancing the predictions both in the perceptual quality\nand the prediction accuracy. The effectiveness and superiority of our\napproaches over baseline models, the standard E-S framework, and the standard\nWeight Binary Cross-Entropy (WBCE) loss function are demonstrated by extensive\nexperiments. For example, our enhanced E-S architecture trained with SWBCE loss\nfunction achieves average improvements of 8.25$\\%$, 8.01$\\%$, and 33.25$\\%$ in\nODS, OIS, and AP, measured on BIPED2 compared with the baseline models,\nsignificantly outperforming the standard E-S method. The results set new\nbenchmarks for ED tasks, and highlight the potential of the methods in beyond.\n","authors":["Hao Shu"],"pdf_url":"https://arxiv.org/pdf/2501.13365v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2501.13357v1","updated":"2025-01-23T03:50:51Z","published":"2025-01-23T03:50:51Z","title":"A light-weight model to generate NDWI from Sentinel-1","summary":" The use of Sentinel-2 images to compute Normalized Difference Water Index\n(NDWI) has many applications, including water body area detection. However,\ncloud cover poses significant challenges in this regard, which hampers the\neffectiveness of Sentinel-2 images in this context. In this paper, we present a\ndeep learning model that can generate NDWI given Sentinel-1 images, thereby\novercoming this cloud barrier. We show the effectiveness of our model, where it\ndemonstrates a high accuracy of 0.9134 and an AUC of 0.8656 to predict the\nNDWI. Additionally, we observe promising results with an R2 score of 0.4984\n(for regressing the NDWI values) and a Mean IoU of 0.4139 (for the underlying\nsegmentation task). In conclusion, our model offers a first and robust solution\nfor generating NDWI images directly from Sentinel-1 images and subsequent use\nfor various applications even under challenging conditions such as cloud cover\nand nighttime.\n","authors":["Saleh Sakib Ahmed","Saifur Rahman Jony","Md. Toufikuzzaman","Saifullah Sayed","Rashed Uz Zzaman","Sara Nowreen","M. Sohel Rahman"],"pdf_url":"https://arxiv.org/pdf/2501.13357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13354v1","updated":"2025-01-23T03:42:22Z","published":"2025-01-23T03:42:22Z","title":"NUDT4MSTAR: A New Dataset and Benchmark Towards SAR Target Recognition\n in the Wild","summary":" Synthetic Aperture Radar (SAR) stands as an indispensable sensor for Earth\nobservation, owing to its unique capability for all-day imaging. Nevertheless,\nin a data-driven era, the scarcity of large-scale datasets poses a significant\nbottleneck to advancing SAR automatic target recognition (ATR) technology. This\npaper introduces NUDT4MSTAR, a large-scale SAR dataset for vehicle target\nrecognition in the wild, including 40 target types and a wide array of imaging\nconditions across 5 different scenes. NUDT4MSTAR represents a significant leap\nforward in dataset scale, containing over 190,000 images-tenfold the size of\nits predecessors. 
To enhance the utility of this dataset, we meticulously\nannotate each image with detailed target information and imaging conditions. We\nalso provide data in both processed magnitude images and original complex\nformats. Then, we construct a comprehensive benchmark consisting of 7\nexperiments with 15 recognition methods focusing on the stable and effective\nATR issues. Besides, we conduct transfer learning experiments utilizing various\nmodels trained on NUDT4MSTAR and applied to three other target datasets,\nthereby demonstrating its substantial potential to the broader field of ground\nobjects ATR. Finally, we discuss this dataset's application value and ATR's\nsignificant challenges. To the best of our knowledge, this work marks the\nfirst-ever endeavor to create a large-scale dataset benchmark for fine-grained\nSAR recognition in the wild, featuring an extensive collection of exhaustively\nannotated vehicle images. We expect that the open source of NUDT4MSTAR will\nfacilitate the development of SAR ATR and attract a wider community of\nresearchers.\n","authors":["Yongxiang Liu","Weijie Li","Li Liu","Jie Zhou","Xuying Xiong","Bowen Peng","Yafei Song","Wei Yang","Tianpeng Liu","Zhen Liu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2501.13354v1.pdf","comment":"18 pages, 15 figures; link:\n https://github.com/waterdisappear/NUDT4MSTAR"},{"id":"http://arxiv.org/abs/2501.13353v1","updated":"2025-01-23T03:34:14Z","published":"2025-01-23T03:34:14Z","title":"Contrast: A Hybrid Architecture of Transformers and State Space Models\n for Low-Level Vision","summary":" Transformers have become increasingly popular for image super-resolution (SR)\ntasks due to their strong global context modeling capabilities. However, their\nquadratic computational complexity necessitates the use of window-based\nattention mechanisms, which restricts the receptive field and limits effective\ncontext expansion. Recently, the Mamba architecture has emerged as a promising\nalternative with linear computational complexity, allowing it to avoid window\nmechanisms and maintain a large receptive field. Nevertheless, Mamba faces\nchallenges in handling long-context dependencies when high pixel-level\nprecision is required, as in SR tasks. This is due to its hidden state\nmechanism, which can compress and store a substantial amount of context but\nonly in an approximate manner, leading to inaccuracies that transformers do not\nsuffer from. In this paper, we propose \\textbf{Contrast}, a hybrid SR model\nthat combines \\textbf{Con}volutional, \\textbf{Tra}nsformer, and \\textbf{St}ate\nSpace components, effectively blending the strengths of transformers and Mamba\nto address their individual limitations. By integrating transformer and state\nspace mechanisms, \\textbf{Contrast} compensates for the shortcomings of each\napproach, enhancing both global context modeling and pixel-level accuracy. We\ndemonstrate that combining these two architectures allows us to mitigate the\nproblems inherent in each, resulting in improved performance on image\nsuper-resolution tasks.\n","authors":["Aman Urumbekov","Zheng Chen"],"pdf_url":"https://arxiv.org/pdf/2501.13353v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.13352v1","updated":"2025-01-23T03:32:52Z","published":"2025-01-23T03:32:52Z","title":"Polyhedra Encoding Transformers: Enhancing Diffusion MRI Analysis Beyond\n Voxel and Volumetric Embedding","summary":" Diffusion-weighted Magnetic Resonance Imaging (dMRI) is an essential tool in\nneuroimaging. 
It is arguably the sole noninvasive technique for examining the\nmicrostructural properties and structural connectivity of the brain. Recent\nyears have seen the emergence of machine learning and data-driven approaches\nthat enhance the speed, accuracy, and consistency of dMRI data analysis.\nHowever, traditional deep learning models often fall short, as they typically\nutilize pixel-level or volumetric patch-level embeddings similar to those used\nin structural MRI, and do not account for the unique distribution of various\ngradient encodings. In this paper, we propose a novel method called Polyhedra\nEncoding Transformer (PE-Transformer) for dMRI, designed specifically to handle\nspherical signals. Our approach involves projecting an icosahedral polygon onto\na unit sphere to resample signals from predetermined directions. These\nresampled signals are then transformed into embeddings, which are processed by\na transformer encoder that incorporates orientational information reflective of\nthe icosahedral structure. Through experimental validation with various\ngradient encoding protocols, our method demonstrates superior accuracy in\nestimating multi-compartment models and Fiber Orientation Distributions (FOD),\noutperforming both conventional CNN architectures and standard transformers.\n","authors":["Tianyuan Yao","Zhiyuan Li","Praitayini Kanakaraj","Derek B. Archer","Kurt Schilling","Lori Beason-Held","Susan Resnick","Bennett A. Landman","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2501.13352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15368v2","updated":"2025-01-23T03:24:01Z","published":"2023-11-26T17:48:48Z","title":"Flow-Guided Diffusion for Video Inpainting","summary":" Video inpainting has been challenged by complex scenarios like large\nmovements and low-light conditions. Current methods, including emerging\ndiffusion models, face limitations in quality and efficiency. This paper\nintroduces the Flow-Guided Diffusion model for Video Inpainting (FGDVI), a\nnovel approach that significantly enhances temporal consistency and inpainting\nquality via reusing an off-the-shelf image generation diffusion model. We\nemploy optical flow for precise one-step latent propagation and introduce a\nmodel-agnostic flow-guided latent interpolation technique. This technique\nexpedites denoising, seamlessly integrating with any Video Diffusion Model\n(VDM) without additional training. Our FGDVI demonstrates a remarkable 10%\nimprovement in flow warping error E_warp over existing state-of-the-art\nmethods. Our comprehensive experiments validate the superior performance of FGDVI,\noffering a promising direction for advanced video inpainting. The code and\ndetailed results will be publicly available at\nhttps://github.com/NevSNev/FGDVI.\n","authors":["Bohai Gu","Yongsheng Yu","Heng Fan","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15368v2.pdf","comment":"This paper has been withdrawn as a new iteration of the work has been\n developed, which includes significant improvements and refinements based on\n this submission. The withdrawal is made to ensure academic integrity and\n compliance with publication standards. If you are interested, please refer to\n the updated work at arXiv:2412.00857"},{"id":"http://arxiv.org/abs/2501.13349v1","updated":"2025-01-23T03:18:23Z","published":"2025-01-23T03:18:23Z","title":"MSF: Efficient Diffusion Model Via Multi-Scale Latent Factorize","summary":" Diffusion-based generative models have achieved remarkable progress in visual\ncontent generation. 
However, traditional diffusion models directly denoise the\nentire image from noisy inputs, disregarding the hierarchical structure present\nin visual signals. This method is computationally intensive, especially for\nhigh-resolution image generation. Signal processing often leverages\nhierarchical decompositions; for instance, Fourier analysis decomposes signals\nby frequency, while wavelet analysis captures localized frequency components,\nreflecting both spatial and frequency information simultaneously. Inspired by\nthese principles, we propose a multiscale diffusion framework that generates\nhierarchical visual representations, which are subsequently integrated to form\nthe final output. The diffusion model target, whether raw RGB pixels or latent\nfeatures from a Variational Autoencoder, is divided into multiple components\nthat each capture distinct spatial levels. The low-resolution component\ncontains the primary informative signal, while higher-resolution components add\nhigh-frequency details, such as texture. This approach divides image generation\ninto two stages: producing a low-resolution base signal, followed by a\nhigh-resolution residual signal. Both stages can be effectively modeled using\nsimpler, lightweight transformer architectures compared to full-resolution\ngeneration. This decomposition is conceptually similar to wavelet decomposition\nbut offers a more streamlined and intuitive design. Our method, termed\nMSF (short for Multi-Scale Factorization), achieves an FID of 2.2 and an IS of\n255.4 on the ImageNet 256x256 benchmark, reducing computational costs by 50%\ncompared to baseline methods.\n","authors":["Haohang Xu","Longyu Chen","Shuangrui Ding","Yilin Gao","Dongsheng Jiang","Yin Li","Shugong Xu","Junqing Yu","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2501.13349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13343v1","updated":"2025-01-23T03:04:30Z","published":"2025-01-23T03:04:30Z","title":"YOLOSCM: An improved YOLO algorithm for cars detection","summary":" Detecting objects in urban traffic images presents considerable difficulties\nbecause of the following reasons: 1) These images are typically immense in\nsize, encompassing millions or even hundreds of millions of pixels, yet\ncomputational resources are constrained. 2) The small size of vehicles in\ncertain scenarios leads to insufficient information for accurate detection. 3)\nThe uneven distribution of vehicles causes inefficient use of computational\nresources. To address these issues, we propose YOLOSCM (You Only Look Once with\nSegmentation Clustering Module), an efficient and effective framework. To\naddress the challenges of large-scale images and the non-uniform distribution\nof vehicles, we propose a Segmentation Clustering Module (SCM). This module\nadaptively identifies clustered regions, enabling the model to focus on these\nareas for more precise detection. Additionally, we propose a new training\nstrategy to optimize the detection of small vehicles and densely packed targets\nin complex urban traffic scenes. 
We perform extensive experiments on urban\ntraffic datasets to demonstrate the effectiveness and superiority of our\nproposed approach.\n","authors":["Changhui Deng","Lieyang Chen","Shinan Liu"],"pdf_url":"https://arxiv.org/pdf/2501.13343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13341v1","updated":"2025-01-23T02:45:35Z","published":"2025-01-23T02:45:35Z","title":"Multi-aspect Knowledge Distillation with Large Language Model","summary":" Recent advancements in deep learning have significantly improved performance\non computer vision tasks. Previous image classification methods primarily\nmodify model architectures or add features, and they optimize models using\ncross-entropy loss on class logits. Since they focus on classifying images with\nconsidering class labels, these methods may struggle to learn various\n\\emph{aspects} of classes (e.g., natural positions and shape changes).\nRethinking the previous approach from a novel view, we propose a multi-aspect\nknowledge distillation method using Multimodal Large Language Models (MLLMs).\nOur approach involves: 1) querying Large Language Model with multi-aspect\nquestions relevant to the knowledge we want to transfer to the model, 2)\nextracting corresponding logits from MLLM, and 3) expanding the model's output\ndimensions to distill these multi-aspect logits. We then apply cross-entropy\nloss to class logits and binary cross-entropy loss to multi-aspect logits.\nThrough our method, the model can learn not only the knowledge about visual\naspects but also the abstract and complex aspects that require a deeper\nunderstanding. We primarily apply our method to image classification, and to\nexplore the potential for extending our model, we expand it to other tasks,\nsuch as object detection. In all experimental results, our method improves the\nperformance of the baselines. Additionally, we analyze the effect of\nmulti-aspect knowledge distillation. These results demonstrate that our method\ncan transfer knowledge about various aspects to the model and the aspect\nknowledge can enhance model performance in computer vision tasks. This paper\ndemonstrates the great potential of multi-aspect knowledge distillation, and we\nbelieve it offers a promising direction for future research in computer vision\nand beyond.\n","authors":["Taegyeong Lee","Jinsik Bang","Soyeong Kwon","Taehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13341v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2501.13340v1","updated":"2025-01-23T02:42:28Z","published":"2025-01-23T02:42:28Z","title":"Retrievals Can Be Detrimental: A Contrastive Backdoor Attack Paradigm on\n Retrieval-Augmented Diffusion Models","summary":" Diffusion models (DMs) have recently demonstrated remarkable generation\ncapability. However, their training generally requires huge computational\nresources and large-scale datasets. To solve these, recent studies empower DMs\nwith the advanced Retrieval-Augmented Generation (RAG) technique and propose\nretrieval-augmented diffusion models (RDMs). By incorporating rich knowledge\nfrom an auxiliary database, RAG enhances diffusion models' generation and\ngeneralization ability while significantly reducing model parameters. Despite\nthe great success, RAG may introduce novel security issues that warrant further\ninvestigation. In this paper, we reveal that the RDM is susceptible to backdoor\nattacks by proposing a multimodal contrastive attack approach named BadRDM. 
Our\nframework fully considers RAG's characteristics and is devised to manipulate\nthe retrieved items for given text triggers, thereby further controlling the\ngenerated contents. Specifically, we first insert a tiny portion of images into\nthe retrieval database as target toxicity surrogates. Subsequently, a malicious\nvariant of contrastive learning is adopted to inject backdoors into the\nretriever, which builds shortcuts from triggers to the toxicity surrogates.\nFurthermore, we enhance the attacks through novel entropy-based selection and\ngenerative augmentation strategies that can derive better toxicity surrogates.\nExtensive experiments on two mainstream tasks demonstrate the proposed BadRDM\nachieves outstanding attack effects while preserving the model's benign\nutility.\n","authors":["Hao Fang","Xiaohang Sui","Hongyao Yu","Jiawei Kong","Sijin Yu","Bin Chen","Hao Wu","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2501.13340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13338v1","updated":"2025-01-23T02:39:04Z","published":"2025-01-23T02:39:04Z","title":"CuriousBot: Interactive Mobile Exploration via Actionable 3D Relational\n Object Graph","summary":" Mobile exploration is a longstanding challenge in robotics, yet current\nmethods primarily focus on active perception instead of active interaction,\nlimiting the robot's ability to interact with and fully explore its\nenvironment. Existing robotic exploration approaches via active interaction are\noften restricted to tabletop scenes, neglecting the unique challenges posed by\nmobile exploration, such as large exploration spaces, complex action spaces,\nand diverse object relations. In this work, we introduce a 3D relational object\ngraph that encodes diverse object relations and enables exploration through\nactive interaction. We develop a system based on this representation and\nevaluate it across diverse scenes. Our qualitative and quantitative results\ndemonstrate the system's effectiveness and generalization capabilities,\noutperforming methods that rely solely on vision-language models (VLMs).\n","authors":["Yixuan Wang","Leonor Fermoselle","Tarik Kelestemur","Jiuguang Wang","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2501.13338v1.pdf","comment":"Project Page: https://curiousbot.theaiinstitute.com/"},{"id":"http://arxiv.org/abs/2411.02441v4","updated":"2025-01-23T02:35:58Z","published":"2024-11-02T13:03:44Z","title":"Cross-D Conv: Cross-Dimensional Transferable Knowledge Base via Fourier\n Shifting Operation","summary":" In biomedical imaging analysis, the dichotomy between 2D and 3D data presents\na significant challenge. While 3D volumes offer superior real-world\napplicability, they are less available for each modality and not easy to train\nin large scale, whereas 2D samples are abundant but less comprehensive. This\npaper introduces Cross-D Conv operation, a novel approach that bridges the\ndimensional gap by learning the phase shifting in the Fourier domain. Our\nmethod enables seamless weight transfer between 2D and 3D convolution\noperations, effectively facilitating cross-dimensional learning. The proposed\narchitecture leverages the abundance of 2D training data to enhance 3D model\nperformance, offering a practical solution to the multimodal data scarcity\nchallenge in 3D medical model pretraining. Experimental validation on the\nRadImagenet (2D) and multimodal volumetric sets demonstrates that our approach\nachieves comparable or superior performance in feature quality assessment. 
The\nenhanced convolution operation presents new opportunities for developing\nefficient classification and segmentation models in medical imaging. This work\nrepresents an advancement in cross-dimensional and multimodal medical image\nanalysis, offering a robust framework for utilizing 2D priors in 3D model\npretraining while maintaining computational efficiency of 2D training.\n","authors":["Mehmet Can Yavuz","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.02441v4.pdf","comment":"Accepted for ISBI25; Codes&Weights:\n https://github.com/convergedmachine/Cross-D-Conv"},{"id":"http://arxiv.org/abs/2501.13336v1","updated":"2025-01-23T02:34:14Z","published":"2025-01-23T02:34:14Z","title":"Gradient-Free Adversarial Purification with Diffusion Models","summary":" Adversarial training and adversarial purification are two effective and\npractical defense methods to enhance a model's robustness against adversarial\nattacks. However, adversarial training necessitates additional training, while\nadversarial purification suffers from low time efficiency. More critically,\ncurrent defenses are designed under the perturbation-based adversarial threat\nmodel, which is ineffective against the recently proposed unrestricted\nadversarial attacks. In this paper, we propose an effective and efficient\nadversarial defense method that counters both perturbation-based and\nunrestricted adversarial attacks. Our defense is inspired by the observation\nthat adversarial attacks are typically located near the decision boundary and\nare sensitive to pixel changes. To address this, we introduce adversarial\nanti-aliasing to mitigate adversarial modifications. Additionally, we propose\nadversarial super-resolution, which leverages prior knowledge from clean\ndatasets to benignly recover images. These approaches do not require additional\ntraining and are computationally efficient without calculating gradients.\nExtensive experiments against both perturbation-based and unrestricted\nadversarial attacks demonstrate that our defense method outperforms\nstate-of-the-art adversarial purification methods.\n","authors":["Xuelong Dai","Dong Wang","Duan Mingxing","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2501.13336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13335v1","updated":"2025-01-23T02:31:57Z","published":"2025-01-23T02:31:57Z","title":"Deblur-Avatar: Animatable Avatars from Motion-Blurred Monocular Videos","summary":" We introduce Deblur-Avatar, a novel framework for modeling high-fidelity,\nanimatable 3D human avatars from motion-blurred monocular video inputs. Motion\nblur is prevalent in real-world dynamic video capture, especially due to human\nmovements in 3D human avatar modeling. Existing methods either (1) assume sharp\nimage inputs, failing to address the detail loss introduced by motion blur, or\n(2) mainly consider blur by camera movements, neglecting the human motion blur\nwhich is more common in animatable avatars. Our proposed approach integrates a\nhuman movement-based motion blur model into 3D Gaussian Splatting (3DGS). By\nexplicitly modeling human motion trajectories during exposure time, we jointly\noptimize the trajectories and 3D Gaussians to reconstruct sharp, high-quality\nhuman avatars. 
We employ a pose-dependent fusion mechanism to distinguish\nmoving body regions, optimizing both blurred and sharp areas effectively.\nExtensive experiments on synthetic and real-world datasets demonstrate that\nDeblur-Avatar significantly outperforms existing methods in rendering quality\nand quantitative metrics, producing sharp avatar reconstructions and enabling\nreal-time rendering under challenging motion blur conditions.\n","authors":["Xianrui Luo","Juewen Peng","Zhongang Cai","Lei Yang","Fan Yang","Zhiguo Cao","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2501.13335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10074v3","updated":"2025-01-23T02:31:25Z","published":"2025-01-17T09:46:27Z","title":"SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and\n Chain-of-Thought for Embodied Task Planning","summary":" Spatial reasoning is an essential problem in embodied AI research. Efforts to\nenhance spatial reasoning abilities through supplementary spatial data and\nfine-tuning have proven limited and ineffective when addressing complex\nembodied tasks, largely due to their dependence on language-based outputs.\nWhile some approaches have introduced a point-based action space to mitigate\nthis issue, they fall short in managing more intricate tasks within complex\nenvironments. This deficiency arises from their failure to fully exploit the\ninherent thinking and reasoning capabilities that are fundamental strengths of\nVision-Language Models (VLMs). To address these limitations, we propose a novel\napproach named SpatialCoT, specifically designed to bolster the spatial\nreasoning capabilities of VLMs. Our approach comprises two stages: spatial\ncoordinate bi-directional alignment, which aligns vision-language inputs with\nspatial coordinates, and chain-of-thought spatial grounding, which harnesses\nthe reasoning capabilities of language models for advanced spatial reasoning.\nWe evaluate SpatialCoT on challenging navigation and manipulation tasks, both\nin simulation and real-world settings. Experimental results demonstrate that\nour method significantly outperforms previous state-of-the-art approaches in\nboth tasks.\n","authors":["Yuecheng Liu","Dafeng Chi","Shiguang Wu","Zhanguang Zhang","Yaochen Hu","Lingfeng Zhang","Yingxue Zhang","Shuang Wu","Tongtong Cao","Guowei Huang","Helong Huang","Guangjian Tian","Weichao Qiu","Xingyue Quan","Jianye Hao","Yuzheng Zhuang"],"pdf_url":"https://arxiv.org/pdf/2501.10074v3.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2501.08819v3","updated":"2025-01-23T02:12:03Z","published":"2025-01-15T14:17:13Z","title":"Boosting Diffusion Guidance via Learning Degradation-Aware Models for\n Blind Super Resolution","summary":" Recently, diffusion-based blind super-resolution (SR) methods have shown\ngreat ability to generate high-resolution images with abundant high-frequency\ndetail, but the detail is often achieved at the expense of fidelity. Meanwhile,\nanother line of research focusing on rectifying the reverse process of\ndiffusion models (i.e., diffusion guidance), has demonstrated the power to\ngenerate high-fidelity results for non-blind SR. However, these methods rely on\nknown degradation kernels, making them difficult to apply to blind SR. To\naddress these issues, we present DADiff in this paper. DADiff incorporates\ndegradation-aware models into the diffusion guidance framework, eliminating the\nneed to know degradation kernels. 
Additionally, we propose two novel\ntechniques: input perturbation and guidance scalar, to further improve our\nperformance. Extensive experimental results show that our proposed method has\nsuperior performance over state-of-the-art methods on blind SR benchmarks.\n","authors":["Shao-Hao Lu","Ren Wang","Ching-Chun Huang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2501.08819v3.pdf","comment":"To appear in WACV 2025. Code is available at:\n https://github.com/ryanlu2240/DADiff"},{"id":"http://arxiv.org/abs/2404.12385v2","updated":"2025-01-23T01:55:24Z","published":"2024-04-18T17:59:41Z","title":"MeshLRM: Large Reconstruction Model for High-Quality Meshes","summary":" We propose MeshLRM, a novel LRM-based approach that can reconstruct a\nhigh-quality mesh from merely four input images in less than one second.\nDifferent from previous large reconstruction models (LRMs) that focus on\nNeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction\nand rendering within the LRM framework. This allows for end-to-end mesh\nreconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering.\nMoreover, we improve the LRM architecture by simplifying several complex\ndesigns in previous LRMs. MeshLRM's NeRF initialization is sequentially trained\nwith low- and high-resolution images; this new LRM training strategy enables\nsignificantly faster convergence and thereby leads to better quality with less\ncompute. Our approach achieves state-of-the-art mesh reconstruction from\nsparse-view inputs and also allows for many downstream applications, including\ntext-to-3D and single-image-to-3D generation. Project page:\nhttps://sarahweiii.github.io/meshlrm/\n","authors":["Xinyue Wei","Kai Zhang","Sai Bi","Hao Tan","Fujun Luan","Valentin Deschaintre","Kalyan Sunkavalli","Hao Su","Zexiang Xu"],"pdf_url":"https://arxiv.org/pdf/2404.12385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07251v2","updated":"2025-01-23T01:40:37Z","published":"2025-01-13T12:00:34Z","title":"MOS-Attack: A Scalable Multi-objective Adversarial Attack Framework","summary":" Crafting adversarial examples is crucial for evaluating and enhancing the\nrobustness of Deep Neural Networks (DNNs), presenting a challenge equivalent to\nmaximizing a non-differentiable 0-1 loss function.\n However, existing single objective methods, namely adversarial attacks focus\non a surrogate loss function, do not fully harness the benefits of engaging\nmultiple loss functions, as a result of insufficient understanding of their\nsynergistic and conflicting nature.\n To overcome these limitations, we propose the Multi-Objective Set-based\nAttack (MOS Attack), a novel adversarial attack framework leveraging multiple\nloss functions and automatically uncovering their interrelations.\n The MOS Attack adopts a set-based multi-objective optimization strategy,\nenabling the incorporation of numerous loss functions without additional\nparameters.\n It also automatically mines synergistic patterns among various losses,\nfacilitating the generation of potent adversarial attacks with fewer\nobjectives.\n Extensive experiments have shown that our MOS Attack outperforms\nsingle-objective attacks. 
Furthermore, by harnessing the identified synergistic\npatterns, MOS Attack continues to show superior results with a reduced number\nof loss functions.\n","authors":["Ping Guo","Cheng Gong","Xi Lin","Fei Liu","Zhichao Lu","Qingfu Zhang","Zhenkun Wang"],"pdf_url":"https://arxiv.org/pdf/2501.07251v2.pdf","comment":"Under Review of CVPR 2025"},{"id":"http://arxiv.org/abs/2501.13307v1","updated":"2025-01-23T01:28:05Z","published":"2025-01-23T01:28:05Z","title":"From Cross-Modal to Mixed-Modal Visible-Infrared Re-Identification","summary":" Visible-infrared person re-identification (VI-ReID) aims to match individuals\nacross different camera modalities, a critical task in modern surveillance\nsystems. While current VI-ReID methods focus on cross-modality matching,\nreal-world applications often involve mixed galleries containing both V and I\nimages, where state-of-the-art methods show significant performance limitations\ndue to large domain shifts and low discrimination across mixed modalities. This\nis because gallery images from the same modality may have lower domain gaps but\ncorrespond to different identities. This paper introduces a novel mixed-modal\nReID setting, where galleries contain data from both modalities. To address the\ndomain shift among inter-modal and low discrimination capacity in intra-modal\nmatching, we propose the Mixed Modality-Erased and -Related (MixER) method. The\nMixER learning approach disentangles modality-specific and modality-shared\nidentity information through orthogonal decomposition, modality-confusion, and\nID-modality-related objectives. MixER enhances feature robustness across\nmodalities, improving cross-modal and mixed-modal settings performance. Our\nextensive experiments on the SYSU-MM01, RegDB and LLMC datasets indicate that\nour approach can provide state-of-the-art results using a single backbone, and\nshowcase the flexibility of our approach in mixed gallery applications.\n","authors":["Mahdi Alehdaghi","Rajarshi Bhattacharya","Pourya Shamsolmoali","Rafael M. O. Cruz","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2501.13307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08331v3","updated":"2025-01-23T01:17:11Z","published":"2025-01-14T18:59:10Z","title":"Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using\n Real-Time Warped Noise","summary":" Generative modeling aims to transform random noise into structured outputs.\nIn this work, we enhance video diffusion models by allowing motion control via\nstructured latent noise sampling. This is achieved by just a change in data: we\npre-process training videos to yield structured noise. Consequently, our method\nis agnostic to diffusion model design, requiring no changes to model\narchitectures or training pipelines. Specifically, we propose a novel noise\nwarping algorithm, fast enough to run in real time, that replaces random\ntemporal Gaussianity with correlated warped noise derived from optical flow\nfields, while preserving the spatial Gaussianity. The efficiency of our\nalgorithm enables us to fine-tune modern video diffusion base models using\nwarped noise with minimal overhead, and provide a one-stop solution for a wide\nrange of user-friendly motion control: local object motion control, global\ncamera movement control, and motion transfer. The harmonization between\ntemporal coherence and spatial Gaussianity in our warped noise leads to\neffective motion control while maintaining per-frame pixel quality. 
Extensive\nexperiments and user studies demonstrate the advantages of our method, making\nit a robust and scalable approach for controlling motion in video diffusion\nmodels. Video results are available on our webpage:\nhttps://eyeline-research.github.io/Go-with-the-Flow. Source code and model\ncheckpoints are available on GitHub:\nhttps://github.com/Eyeline-Research/Go-with-the-Flow.\n","authors":["Ryan Burgert","Yuancheng Xu","Wenqi Xian","Oliver Pilarski","Pascal Clausen","Mingming He","Li Ma","Yitong Deng","Lingxiao Li","Mohsen Mousavi","Michael Ryoo","Paul Debevec","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2501.08331v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14876v2","updated":"2025-01-23T01:08:19Z","published":"2024-04-02T01:42:32Z","title":"Precise and Robust Sidewalk Detection: Leveraging Ensemble Learning to\n Surpass LLM Limitations in Urban Environments","summary":" This study aims to compare the effectiveness of a robust ensemble model with\nthe state-of-the-art ONE-PEACE Large Language Model (LLM) for accurate\ndetection of sidewalks. Accurate sidewalk detection is crucial in improving\nroad safety and urban planning. The study evaluated the model's performance on\nCityscapes, Ade20k, and the Boston Dataset. The results showed that the\nensemble model performed better than the individual models, achieving mean\nIntersection Over Union (mIOU) scores of 93.1\\%, 90.3\\%, and 90.6\\% on these\ndatasets under ideal conditions. Additionally, the ensemble model maintained a\nconsistent level of performance even in challenging conditions such as\nSalt-and-Pepper and Speckle noise, with only a gradual decrease in efficiency\nobserved. On the other hand, the ONE-PEACE LLM performed slightly better than\nthe ensemble model in ideal scenarios but experienced a significant decline in\nperformance under noisy conditions. These findings demonstrate the robustness\nand reliability of the ensemble model, making it a valuable asset for improving\nurban infrastructure related to road safety and curb space management. This\nstudy contributes positively to the broader context of urban health and\nmobility.\n","authors":["Ibne Farabi Shihab","Sudesh Ramesh Bhagat","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2405.14876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10011v2","updated":"2025-01-23T00:32:49Z","published":"2024-07-13T21:35:13Z","title":"Sim-to-Real Domain Adaptation for Deformation Classification","summary":" Deformation detection is vital for enabling accurate assessment and\nprediction of structural changes in materials, ensuring timely and effective\ninterventions to maintain safety and integrity. Automating deformation\ndetection through computer vision is crucial for efficient monitoring, but it\nfaces significant challenges in creating a comprehensive dataset of both\ndeformed and non-deformed objects, which can be difficult to obtain in many\nscenarios. In this paper, we introduce a novel framework for generating\ncontrolled synthetic data that simulates deformed objects. This approach allows\nfor the realistic modeling of object deformations under various conditions. Our\nframework integrates an intelligent adapter network that facilitates\nsim-to-real domain adaptation, enhancing classification results without\nrequiring real data from deformed objects. 
We conduct experiments on domain\nadaptation and classification tasks and demonstrate that our framework improves\nsim-to-real classification results compared to the simulation baseline.\n","authors":["Joel Sol","Jamil Fayyad","Shadi Alijani","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2407.10011v2.pdf","comment":"7 pages, 5 figures, submitted to SMC"},{"id":"http://arxiv.org/abs/2404.14657v2","updated":"2025-01-23T00:01:50Z","published":"2024-04-23T01:34:20Z","title":"Progressive Token Length Scaling in Transformer Encoders for Efficient\n Universal Segmentation","summary":" A powerful architecture for universal segmentation relies on transformers\nthat encode multi-scale image features and decode object queries into mask\npredictions. With efficiency being a high priority for scaling such models, we\nobserved that the state-of-the-art method Mask2Former uses 50% of its compute\nonly on the transformer encoder. This is due to the retention of a full-length\ntoken-level representation of all backbone feature scales at each encoder\nlayer. With this observation, we propose a strategy termed PROgressive Token\nLength SCALing for Efficient transformer encoders (PRO-SCALE) that can be\nplugged into the Mask2Former segmentation architecture to significantly reduce\nthe computational cost. The underlying principle of PRO-SCALE is: progressively\nscale the length of the tokens with the layers of the encoder. This allows\nPRO-SCALE to reduce computations by a large margin with minimal sacrifice in\nperformance (~52% encoder and ~27% overall GFLOPs reduction with no drop in\nperformance on the COCO dataset). Experiments conducted on public benchmarks\ndemonstrate PRO-SCALE's flexibility in architectural configurations and\nexhibit its potential for extension beyond the settings of segmentation tasks to\nencompass object detection. Code here:\nhttps://github.com/abhishekaich27/proscale-pytorch\n","authors":["Abhishek Aich","Yumin Suh","Samuel Schulter","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2404.14657v2.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2501.14122v1","updated":"2025-01-23T22:36:06Z","published":"2025-01-23T22:36:06Z","title":"Reinforcement Learning Platform for Adversarial Black-box Attacks with\n Custom Distortion Filters","summary":" We present a Reinforcement Learning Platform for Adversarial Black-box\nuntargeted and targeted attacks, RLAB, that allows users to select from various\ndistortion filters to create adversarial examples. The platform uses a\nReinforcement Learning agent to add minimum distortion to input images while\nstill causing misclassification by the target model. The agent uses a novel\ndual-action method to explore the input image at each step to identify\nsensitive regions for adding distortions while removing noises that have less\nimpact on the target model. This dual action leads to faster and more efficient\nconvergence of the attack. The platform can also be used to measure the\nrobustness of image classification models against specific distortion types.\nAlso, retraining the model with adversarial samples significantly improved\nrobustness when evaluated on benchmark datasets. The proposed platform\noutperforms state-of-the-art methods in terms of the average number of queries\nrequired to cause misclassification. 
This advances trustworthiness with a\npositive social impact.\n","authors":["Soumyendu Sarkar","Ashwin Ramesh Babu","Sajad Mousavi","Vineet Gundecha","Sahand Ghorbanpour","Avisek Naug","Ricardo Luna Gutierrez","Antonio Guillen"],"pdf_url":"https://arxiv.org/pdf/2501.14122v1.pdf","comment":"Under Review for 2025 AAAI Conference on Artificial Intelligence\n Proceedings"},{"id":"http://arxiv.org/abs/2309.12397v2","updated":"2025-01-23T22:17:17Z","published":"2023-09-21T18:00:34Z","title":"POLAR-Sim: Augmenting NASA's POLAR Dataset for Data-Driven Lunar\n Perception and Rover Simulation","summary":" NASA's POLAR dataset contains approximately 2,600 pairs of high dynamic range\nstereo photos captured across 13 varied terrain scenarios, including areas with\nsparse or dense rock distributions, craters, and rocks of different sizes. The\npurpose of these photos is to spur development in robotics, AI-based\nperception, and autonomous navigation. Acknowledging a scarcity of lunar images\nfrom around the lunar poles, NASA Ames produced on Earth but in controlled\nconditions images that resemble rover operating conditions from these regions\nof the Moon. We report on the outcomes of an effort aimed at accomplishing two\ntasks. In Task 1, we provided bounding boxes and semantic segmentation\ninformation for all the images in NASA's POLAR dataset. This effort resulted in\n23,000 labels and semantic segmentation annotations pertaining to rocks,\nshadows, and craters. In Task 2, we generated the digital twins of the 13\nscenarios that have been used to produce all the photos in the POLAR dataset.\nSpecifically, for each of these scenarios, we produced individual meshes,\ntexture information, and material properties associated with the ground and the\nrocks in each scenario. This allows anyone with a camera model to synthesize\nimages associated with any of the 13 scenarios of the POLAR dataset.\nEffectively, one can generate as many semantically labeled synthetic images as\ndesired -- with different locations and exposure values in the scene, for\ndifferent positions of the sun, with or without the presence of active\nillumination, etc. The benefit of this work is twofold. Using outcomes of Task\n1, one can train and/or test perception algorithms that deal with Moon images.\nFor Task 2, one can produce as much data as desired to train and test AI\nalgorithms that are anticipated to work in lunar conditions. All the outcomes\nof this work are available in a public repository for unfettered use and\ndistribution.\n","authors":["Bo-Hsun Chen","Peter Negrut","Thomas Liang","Nevindu Batagoda","Harry Zhang","Dan Negrut"],"pdf_url":"https://arxiv.org/pdf/2309.12397v2.pdf","comment":"11 pages, 9 figures. This work has been submitted to the IEEE for\n possible publication"},{"id":"http://arxiv.org/abs/2501.14101v1","updated":"2025-01-23T21:20:10Z","published":"2025-01-23T21:20:10Z","title":"StreamingRAG: Real-time Contextual Retrieval and Generation Framework","summary":" Extracting real-time insights from multi-modal data streams from various\ndomains such as healthcare, intelligent transportation, and satellite remote\nsensing remains a challenge. High computational demands and limited knowledge\nscope restrict the applicability of Multi-Modal Large Language Models (MM-LLMs)\non these data streams. Traditional Retrieval-Augmented Generation (RAG) systems\naddress knowledge limitations of these models, but suffer from slow\npreprocessing, making them unsuitable for real-time analysis. 
We propose\nStreamingRAG, a novel RAG framework designed for streaming data. StreamingRAG\nconstructs evolving knowledge graphs capturing scene-object-entity\nrelationships in real-time. The knowledge graph achieves temporal-aware scene\nrepresentations using MM-LLMs and enables timely responses for specific events\nor user queries. StreamingRAG addresses limitations in existing methods,\nachieving significant improvements in real-time analysis (5-6x faster\nthroughput), contextual accuracy (through a temporal knowledge graph), and\nreduced resource consumption (using lightweight models by 2-3x).\n","authors":["Murugan Sankaradas","Ravi K. Rajendran","Srimat T. Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2501.14101v1.pdf","comment":"Accepted and Presented at AI4Sys, HPDC 2024"},{"id":"http://arxiv.org/abs/2410.13147v8","updated":"2025-01-23T20:19:40Z","published":"2024-10-17T02:04:57Z","title":"Utilizing Large Language Models in an iterative paradigm with domain\n feedback for zero-shot molecule optimization","summary":" Molecule optimization is a critical task in drug discovery to optimize\ndesired properties of a given molecule. Despite Large Language Models (LLMs)\nholding the potential to efficiently simulate this task by using natural\nlanguage to direct the optimization, straightforwardly utilizing them shows\nlimited performance. In this work, we facilitate utilizing LLMs in an iterative\nparadigm by proposing a simple yet effective domain feedback provider, namely\n$\\text{Re}^2$DF. In detail, $\\text{Re}^2$DF harnesses an external toolkit,\nRDKit, to handle the molecule hallucination, if the modified molecule is\nchemically invalid. Otherwise, $\\text{Re}^2$DF verifies whether the modified\nmolecule meets the objective, if not, its desired properties are computed and\ncompared to the original one, establishing reliable domain feedback with\ncorrect direction and distance towards the objective to explicitly guide the\nLLM to refine the modified molecule. We conduct experiments across both single-\nand multi-property objectives with 2 thresholds, where $\\text{Re}^2$DF shows\nsignificant improvements. Notably, for 20 single-property objectives,\n$\\text{Re}^2$DF enhances Hit ratio by 16.96% and 20.76% under loose\n(\\texttt{l}) and strict (\\texttt{s}) thresholds, respectively. For 32\nmulti-property objectives, $\\text{Re}^2$DF enhances Hit ratio by 6.04% and\n5.25%.\n","authors":["Khiem Le","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2410.13147v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14070v1","updated":"2025-01-23T20:12:56Z","published":"2025-01-23T20:12:56Z","title":"Expanding on the BRIAR Dataset: A Comprehensive Whole Body Biometric\n Recognition Resource at Extreme Distances and Real-World Scenarios\n (Collections 1-4)","summary":" The state-of-the-art in biometric recognition algorithms and operational\nsystems has advanced quickly in recent years providing high accuracy and\nrobustness in more challenging collection environments and consumer\napplications. 
However, the technology still suffers greatly when applied to\nnon-conventional settings such as those seen when performing identification at\nextreme distances or from elevated cameras on buildings or mounted to UAVs.\nThis paper summarizes an extension to the largest dataset currently focused on\naddressing these operational challenges, and describes its composition as well\nas methodologies of collection, curation, and annotation.\n","authors":["Gavin Jager","David Cornett III","Gavin Glenn","Deniz Aykac","Christi Johnson","Robert Zhang","Ryan Shivers","David Bolme","Laura Davies","Scott Dolvin","Nell Barber","Joel Brogan","Nick Burchfield","Carl Dukes","Andrew Duncan","Regina Ferrell","Austin Garrett","Jim Goddard","Jairus Hines","Bart Murphy","Sean Pharris","Brandon Stockwell","Leanne Thompson","Matthew Yohe"],"pdf_url":"https://arxiv.org/pdf/2501.14070v1.pdf","comment":"10 pages, 11 figures, 2 tables, submitted to CVPR"},{"id":"http://arxiv.org/abs/2501.14066v1","updated":"2025-01-23T20:01:33Z","published":"2025-01-23T20:01:33Z","title":"Efficient 2D CT Foundation Model for Contrast Phase Classification","summary":" Purpose: The purpose of this study is to harness the efficiency of a 2D\nfoundation model to develop a robust phase classifier that is resilient to\ndomain shifts.\n Materials and Methods: This retrospective study utilized three public\ndatasets from separate institutions. A 2D foundation model was trained on the\nDeepLesion dataset (mean age: 51.2, s.d.: 17.6; 2398 males) to generate\nembeddings from 2D CT slices for downstream contrast phase classification. The\nclassifier was trained on the VinDr Multiphase dataset and externally validated\non the WAW-TACE dataset. The 2D model was also compared to three 3D supervised\nmodels.\n Results: On the VinDr dataset (146 male, 63 female, 56 unidentified), the\nmodel achieved near-perfect AUROC scores and F1 scores of 99.2%, 94.2%, and\n93.1% for non-contrast, arterial, and venous phases, respectively. The `Other'\ncategory scored lower (F1: 73.4%) due to combining multiple contrast phases\ninto one class. On the WAW-TACE dataset (mean age: 66.1, s.d.: 10.0; 185\nmales), the model showed strong performance with AUROCs of 91.0% and 85.6%, and\nF1 scores of 87.3% and 74.1% for non-contrast and arterial phases. Venous phase\nperformance was lower, with AUROC and F1 scores of 81.7% and 70.2%\nrespectively, due to label mismatches. Compared to 3D supervised models, the\napproach trained faster, performed as well or better, and showed greater\nrobustness to domain shifts.\n Conclusion: The robustness of the 2D Foundation model may be potentially\nuseful for automation of hanging protocols and data orchestration for clinical\ndeployment of AI algorithms.\n","authors":["Benjamin Hou","Tejas Sudharshan Mathai","Pritam Mukherjee","Xinya Wang","Ronald M. Summers","Zhiyong Lub"],"pdf_url":"https://arxiv.org/pdf/2501.14066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14056v1","updated":"2025-01-23T19:43:05Z","published":"2025-01-23T19:43:05Z","title":"Prior Knowledge Injection into Deep Learning Models Predicting Gene\n Expression from Whole Slide Images","summary":" Cancer diagnosis and prognosis primarily depend on clinical parameters such\nas age and tumor grade, and are increasingly complemented by molecular data,\nsuch as gene expression, from tumor sequencing. However, sequencing is costly\nand delays oncology workflows. 
Recent advances in Deep Learning allow to\npredict molecular information from morphological features within Whole Slide\nImages (WSIs), offering a cost-effective proxy of the molecular markers. While\npromising, current methods lack the robustness to fully replace direct\nsequencing. Here we aim to improve existing methods by introducing a\nmodel-agnostic framework that allows to inject prior knowledge on gene-gene\ninteractions into Deep Learning architectures, thereby increasing accuracy and\nrobustness. We design the framework to be generic and flexibly adaptable to a\nwide range of architectures. In a case study on breast cancer, our strategy\nleads to an average increase of 983 significant genes (out of 25,761) across\nall 18 experiments, with 14 generalizing to an increase on an independent\ndataset. Our findings reveal a high potential for injection of prior knowledge\nto increase gene expression prediction performance from WSIs across a wide\nrange of architectures.\n","authors":["Max Hallemeesch","Marija Pizurica","Paloma Rabaey","Olivier Gevaert","Thomas Demeester","Kathleen Marchal"],"pdf_url":"https://arxiv.org/pdf/2501.14056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14051v1","updated":"2025-01-23T19:34:48Z","published":"2025-01-23T19:34:48Z","title":"Revisiting CLIP: Efficient Alignment of 3D MRI and Tabular Data using\n Domain-Specific Foundation Models","summary":" Multi-modal models require aligned, shared embedding spaces. However, common\nCLIP-based approaches need large amounts of samples and do not natively support\n3D or tabular data, both of which are crucial in the medical domain. To address\nthese issues, we revisit CLIP-style alignment by training a domain-specific 3D\nfoundation model as an image encoder and demonstrate that modality alignment is\nfeasible with only 62 MRI scans. Our approach is enabled by a simple embedding\naccumulation strategy required for training in 3D, which scales the amount of\nnegative pairs across batches in order to stabilize training. We perform a\nthorough evaluation of various design choices, including the choice of backbone\nand loss functions, and evaluate the proposed methodology on zero-shot\nclassification and image-retrieval tasks. While zero-shot image-retrieval\nremains challenging, zero-shot classification results demonstrate that the\nproposed approach can meaningfully align the representations of 3D MRI with\ntabular data.\n","authors":["Jakob Krogh Petersen","Valdemar Licht","Mads Nielsen","Asbjørn Munk"],"pdf_url":"https://arxiv.org/pdf/2501.14051v1.pdf","comment":"10 pages, 2 figures. To be published in ISBI 2025"},{"id":"http://arxiv.org/abs/2501.14048v1","updated":"2025-01-23T19:29:34Z","published":"2025-01-23T19:29:34Z","title":"SIDDA: SInkhorn Dynamic Domain Adaptation for Image Classification with\n Equivariant Neural Networks","summary":" Modern neural networks (NNs) often do not generalize well in the presence of\na \"covariate shift\"; that is, in situations where the training and test data\ndistributions differ, but the conditional distribution of classification labels\nremains unchanged. In such cases, NN generalization can be reduced to a problem\nof learning more domain-invariant features. Domain adaptation (DA) methods\ninclude a range of techniques aimed at achieving this; however, these methods\nhave struggled with the need for extensive hyperparameter tuning, which then\nincurs significant computational costs. 
In this work, we introduce SIDDA, an\nout-of-the-box DA training algorithm built upon the Sinkhorn divergence, that\ncan achieve effective domain alignment with minimal hyperparameter tuning and\ncomputational overhead. We demonstrate the efficacy of our method on multiple\nsimulated and real datasets of varying complexity, including simple shapes,\nhandwritten digits, and real astronomical observations. SIDDA is compatible\nwith a variety of NN architectures, and it works particularly well in improving\nclassification accuracy and model calibration when paired with equivariant\nneural networks (ENNs). We find that SIDDA enhances the generalization\ncapabilities of NNs, achieving up to a $\\approx40\\%$ improvement in\nclassification accuracy on unlabeled target data. We also study the efficacy of\nDA on ENNs with respect to the varying group orders of the dihedral group\n$D_N$, and find that the model performance improves as the degree of\nequivariance increases. Finally, we find that SIDDA enhances model calibration\non both source and target data--achieving over an order of magnitude\nimprovement in the ECE and Brier score. SIDDA's versatility, combined with its\nautomated approach to domain alignment, has the potential to advance\nmulti-dataset studies by enabling the development of highly generalizable\nmodels.\n","authors":["Sneh Pandya","Purvik Patel","Brian D. Nord","Mike Walmsley","Aleksandra Ćiprijanović"],"pdf_url":"https://arxiv.org/pdf/2501.14048v1.pdf","comment":"25 pages, 5 figures, 4 tables. code available at:\n https://github.com/deepskies/SIDDA"},{"id":"http://arxiv.org/abs/2501.14046v1","updated":"2025-01-23T19:26:14Z","published":"2025-01-23T19:26:14Z","title":"LLM-guided Instance-level Image Manipulation with Diffusion U-Net\n Cross-Attention Maps","summary":" The advancement of text-to-image synthesis has introduced powerful generative\nmodels capable of creating realistic images from textual prompts. However,\nprecise control over image attributes remains challenging, especially at the\ninstance level. While existing methods offer some control through fine-tuning\nor auxiliary information, they often face limitations in flexibility and\naccuracy. To address these challenges, we propose a pipeline leveraging Large\nLanguage Models (LLMs), open-vocabulary detectors, cross-attention maps and\nintermediate activations of diffusion U-Net for instance-level image\nmanipulation. Our method detects objects mentioned in the prompt and present in\nthe generated image, enabling precise manipulation without extensive training\nor input masks. By incorporating cross-attention maps, our approach ensures\ncoherence in manipulated images while controlling object positions. Our method\nenables precise manipulations at the instance level without fine-tuning or\nauxiliary information such as masks or bounding boxes. Code is available at\nhttps://github.com/Palandr123/DiffusionU-NetLLM\n","authors":["Andrey Palaev","Adil Khan","Syed M. Ahsan Kazmi"],"pdf_url":"https://arxiv.org/pdf/2501.14046v1.pdf","comment":"Presented at BMVC 2024"},{"id":"http://arxiv.org/abs/2501.14038v1","updated":"2025-01-23T19:11:53Z","published":"2025-01-23T19:11:53Z","title":"Implicit Neural Surface Deformation with Explicit Velocity Fields","summary":" In this work, we introduce the first unsupervised method that simultaneously\npredicts time-varying neural implicit surfaces and deformations between pairs\nof point clouds. 
We propose to model the point movement using an explicit\nvelocity field and directly deform a time-varying implicit field using the\nmodified level-set equation. This equation utilizes an iso-surface evolution\nwith Eikonal constraints in a compact formulation, ensuring the integrity of\nthe signed distance field. By applying a smooth, volume-preserving constraint\nto the velocity field, our method successfully recovers physically plausible\nintermediate shapes. Our method is able to handle both rigid and non-rigid\ndeformations without any intermediate shape supervision. Our experimental\nresults demonstrate that our method significantly outperforms existing works,\ndelivering superior results in both quality and efficiency.\n","authors":["Lu Sang","Zehranaz Canfes","Dongliang Cao","Florian Bernard","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2501.14038v1.pdf","comment":"ICLR 2025, 10 pages"},{"id":"http://arxiv.org/abs/2501.14014v1","updated":"2025-01-23T18:51:52Z","published":"2025-01-23T18:51:52Z","title":"INDIGO+: A Unified INN-Guided Probabilistic Diffusion Algorithm for\n Blind and Non-Blind Image Restoration","summary":" Generative diffusion models are becoming one of the most popular prior in\nimage restoration (IR) tasks due to their remarkable ability to generate\nrealistic natural images. Despite achieving satisfactory results, IR methods\nbased on diffusion models present several limitations. First of all, most\nnon-blind approaches require an analytical expression of the degradation model\nto guide the sampling process. Secondly, most existing blind approaches rely on\nfamilies of pre-defined degradation models for training their deep networks.\nThe above issues limit the flexibility of these approaches and so their ability\nto handle real-world degradation tasks. In this paper, we propose a novel\nINN-guided probabilistic diffusion algorithm for non-blind and blind image\nrestoration, namely INDIGO and BlindINDIGO, which combines the merits of the\nperfect reconstruction property of invertible neural networks (INN) with the\nstrong generative capabilities of pre-trained diffusion models. Specifically,\nwe train the forward process of the INN to simulate an arbitrary degradation\nprocess and use the inverse to obtain an intermediate image that we use to\nguide the reverse diffusion sampling process through a gradient step. We also\nintroduce an initialization strategy, to further improve the performance and\ninference speed of our algorithm. Experiments demonstrate that our algorithm\nobtains competitive results compared with recently leading methods both\nquantitatively and visually on synthetic and real-world low-quality images.\n","authors":["Di You","Pier Luigi Dragotti"],"pdf_url":"https://arxiv.org/pdf/2501.14014v1.pdf","comment":"Accepted by IEEE Journal of Selected Topics in Signal Processing\n (JSTSP)"},{"id":"http://arxiv.org/abs/2501.14013v1","updated":"2025-01-23T18:45:24Z","published":"2025-01-23T18:45:24Z","title":"Leveraging Multiphase CT for Quality Enhancement of Portal Venous CT:\n Utility for Pancreas Segmentation","summary":" Multiphase CT studies are routinely obtained in clinical practice for\ndiagnosis and management of various diseases, such as cancer. However, the CT\nstudies can be acquired with low radiation doses, different scanners, and are\nfrequently affected by motion and metal artifacts. Prior approaches have\ntargeted the quality improvement of one specific CT phase (e.g., non-contrast\nCT). 
In this work, we hypothesized that leveraging multiple CT phases for the\nquality enhancement of one phase may prove advantageous for downstream tasks,\nsuch as segmentation. A 3D progressive fusion and non-local (PFNL) network was\ndeveloped. It was trained with three degraded (low-quality) phases\n(non-contrast, arterial, and portal venous) to enhance the quality of the\nportal venous phase. Then, the effect of scan quality enhancement was evaluated\nusing a proxy task of pancreas segmentation, which is useful for tracking\npancreatic cancer. The proposed approach improved the pancreas segmentation by\n3% over the corresponding low-quality CT scan. To the best of our knowledge, we\nare the first to harness multiphase CT for scan quality enhancement and\nimproved pancreas segmentation.\n","authors":["Xinya Wang","Tejas Sudharshan Mathai","Boah Kim","Ronald M. Summers"],"pdf_url":"https://arxiv.org/pdf/2501.14013v1.pdf","comment":"ISBI 2025"}]},"2025-01-24T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.14678v1","updated":"2025-01-24T17:57:00Z","published":"2025-01-24T17:57:00Z","title":"A Predictive Approach for Enhancing Accuracy in Remote Robotic Surgery\n Using Informer Model","summary":" Precise and real-time estimation of the robotic arm's position on the\npatient's side is essential for the success of remote robotic surgery in\nTactile Internet (TI) environments. This paper presents a prediction model\nbased on the Transformer-based Informer framework for accurate and efficient\nposition estimation. Additionally, it combines a Four-State Hidden Markov Model\n(4-State HMM) to simulate realistic packet loss scenarios. The proposed\napproach addresses challenges such as network delays, jitter, and packet loss\nto ensure reliable and precise operation in remote surgical applications. The\nmethod integrates the optimization problem into the Informer model by embedding\nconstraints such as energy efficiency, smoothness, and robustness into its\ntraining process using a differentiable optimization layer. The Informer\nframework uses features such as ProbSparse attention, attention distilling, and\na generative-style decoder to focus on position-critical features while\nmaintaining a low computational complexity of O(L log L). The method is\nevaluated using the JIGSAWS dataset, achieving a prediction accuracy of over 90\npercent under various network scenarios. A comparison with models such as TCN,\nRNN, and LSTM demonstrates the Informer framework's superior performance in\nhandling position prediction and meeting real-time requirements, making it\nsuitable for Tactile Internet-enabled robotic surgery.\n","authors":["Muhammad Hanif Lashari","Shakil Ahmed","Wafa Batayneh","Ashfaq Khokhar"],"pdf_url":"https://arxiv.org/pdf/2501.14678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14672v1","updated":"2025-01-24T17:48:29Z","published":"2025-01-24T17:48:29Z","title":"Gaussian-Process-based Adaptive Tracking Control with Dynamic Active\n Learning for Autonomous Ground Vehicles","summary":" This article proposes an active-learning-based adaptive trajectory tracking\ncontrol method for autonomous ground vehicles to compensate for modeling errors\nand unmodeled dynamics. The nominal vehicle model is decoupled into lateral and\nlongitudinal subsystems, which are augmented with online Gaussian Processes\n(GPs), using measurement data. 
The estimated mean functions of the GPs are used\nto construct a feedback compensator, which, together with an LPV state feedback\ncontroller designed for the nominal system, gives the adaptive control\nstructure. To assist exploration of the dynamics, the paper proposes a new,\ndynamic active learning method to collect the most informative samples to\naccelerate the training process. To analyze the performance of the overall\nlearning tool-chain provided controller, a novel iterative,\ncounterexample-based algorithm is proposed for calculating the induced L2 gain\nbetween the reference trajectory and the tracking error. The analysis can be\nexecuted for a set of possible realizations of the to-be-controlled system,\ngiving robust performance certificate of the learning method under variation of\nthe vehicle dynamics. The efficiency of the proposed control approach is shown\non a high-fidelity physics simulator and in real experiments using a 1/10 scale\nF1TENTH electric car.\n","authors":["Kristóf Floch","Tamás Péni","Roland Tóth"],"pdf_url":"https://arxiv.org/pdf/2501.14672v1.pdf","comment":"Submitted to IEEE Transactions on Control Systems Technology"},{"id":"http://arxiv.org/abs/2409.18592v2","updated":"2025-01-24T17:21:45Z","published":"2024-09-27T09:51:45Z","title":"From One to the Power of Many: Invariance to Multi-LiDAR Perception from\n Single-Sensor Datasets","summary":" Recently, LiDAR segmentation methods for autonomous vehicles, powered by deep\nneural networks, have experienced steep growth in performance on classic\nbenchmarks, such as nuScenes and SemanticKITTI. However, there are still large\ngaps in performance when deploying models trained on such single-sensor setups\nto modern vehicles with multiple high-resolution LiDAR sensors. In this work,\nwe introduce a new metric for feature-level invariance which can serve as a\nproxy to measure cross-domain generalization without requiring labeled data.\nAdditionally, we propose two application-specific data augmentations, which\nfacilitate better transfer to multi-sensor LiDAR setups, when trained on\nsingle-sensor datasets. We provide experimental evidence on both simulated and\nreal data, that our proposed augmentations improve invariance across LiDAR\nsetups, leading to improved generalization.\n","authors":["Marc Uecker","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2409.18592v2.pdf","comment":"Accepted for publication at the ML4AD Workshop @ AAAI Conference 2025"},{"id":"http://arxiv.org/abs/2501.14616v1","updated":"2025-01-24T16:33:56Z","published":"2025-01-24T16:33:56Z","title":"QuIP: Experimental design for expensive simulators with many Qualitative\n factors via Integer Programming","summary":" The need to explore and/or optimize expensive simulators with many\nqualitative factors arises in broad scientific and engineering problems. Our\nmotivating application lies in path planning - the exploration of feasible\npaths for navigation, which plays an important role in robotics, surgical\nplanning and assembly planning. Here, the feasibility of a path is evaluated\nvia expensive virtual experiments, and its parameter space is typically\ndiscrete and high-dimensional. A carefully selected experimental design is thus\nessential for timely decision-making. We propose here a novel framework, called\nQuIP, for experimental design of Qualitative factors via Integer Programming\nunder a Gaussian process surrogate model with an exchangeable covariance\nfunction. 
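A rough sketch of the kind of surrogate QuIP assumes above: a Gaussian process over purely qualitative factors whose exchangeable covariance depends only on which factors share a level. The `rho` and `var` parameters and the integer level coding are hypothetical choices for illustration, not the paper's exact model.

```python
# Sketch of an exchangeable covariance over qualitative factors: each factor
# contributes a fixed correlation `rho` when two configurations disagree on
# its level, and 1 when they agree (a product of per-factor kernels).
import numpy as np

def exchangeable_kernel(X1, X2, rho=0.6, var=1.0):
    """X1: (n, d) and X2: (m, d) integer-coded factor levels."""
    matches = (X1[:, None, :] == X2[None, :, :])      # (n, m, d) level agreement
    corr = np.where(matches, 1.0, rho).prod(axis=-1)  # product over factors
    return var * corr

# Toy usage: three qualitative factors with integer-coded levels.
X = np.array([[0, 1, 2],
              [0, 1, 0],
              [2, 2, 2]])
K = exchangeable_kernel(X, X)
print(K)  # symmetric, with var on the diagonal
```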
For initial design, we show that its asymptotic D-optimal design can\nbe formulated as a variant of the well-known assignment problem in operations\nresearch, which can be efficiently solved to global optimality using\nstate-of-the-art integer programming solvers. For sequential design\n(specifically, for active learning or black-box optimization), we show that its\ndesign criterion can similarly be formulated as an assignment problem, thus\nenabling efficient and reliable optimization with existing solvers. We then\ndemonstrate the effectiveness of QuIP over existing methods in a suite of path\nplanning experiments and an application to rover trajectory optimization.\n","authors":["Yen-Chun Liu","Simon Mak"],"pdf_url":"https://arxiv.org/pdf/2501.14616v1.pdf","comment":"40 pages, 6 figures, submitted to JCGS"},{"id":"http://arxiv.org/abs/2501.14587v1","updated":"2025-01-24T15:48:41Z","published":"2025-01-24T15:48:41Z","title":"Visual Localization via Semantic Structures in Autonomous Photovoltaic\n Power Plant Inspection","summary":" Inspection systems utilizing unmanned aerial vehicles (UAVs) equipped with\nthermal cameras are increasingly popular for the maintenance of photovoltaic\n(PV) power plants. However, automation of the inspection task is a challenging\nproblem as it requires precise navigation to capture images from optimal\ndistances and viewing angles.\n This paper presents a novel localization pipeline that directly integrates PV\nmodule detection with UAV navigation, allowing precise positioning during\ninspection. Detections are used to identify the power plant structures in the\nimage and associate these with the power plant model. We define visually\nrecognizable anchor points for the initial association and use object tracking\nto discern global associations. We present three distinct methods for visual\nsegmentation of PV modules based on traditional computer vision, deep learning,\nand their fusion, and we evaluate their performance in relation to the proposed\nlocalization pipeline.\n The presented methods were verified and evaluated using custom aerial\ninspection data sets, demonstrating their robustness and applicability for\nreal-time navigation. Additionally, we evaluate the influence of the power\nplant model's precision on the localization methods.\n","authors":["Viktor Kozák","Karel Košnar","Jan Chudoba","Miroslav Kulich","Libor Přeučil"],"pdf_url":"https://arxiv.org/pdf/2501.14587v1.pdf","comment":"47 pages, 22 figures"},{"id":"http://arxiv.org/abs/2501.14557v1","updated":"2025-01-24T15:02:18Z","published":"2025-01-24T15:02:18Z","title":"Optimizing Grasping Precision for Industrial Pick-and-Place Tasks\n Through a Novel Visual Servoing Approach","summary":" The integration of robotic arm manipulators into industrial manufacturing\nlines has become common, thanks to their efficiency and effectiveness in\nexecuting specific tasks. With advancements in camera technology, visual\nsensors and perception systems have been incorporated to address more complex\noperations. This study introduces a novel visual servoing control system\ndesigned for robotic operations in challenging environments, where accurate\nobject pose estimation is hindered by factors such as vibrations, tool path\ndeviations, and machining marks. To overcome these obstacles, our solution\nfocuses on enhancing the accuracy of picking and placing tasks, ensuring\nreliable performance across various scenarios. 
This is accomplished by a novel\nvisual servoing method based on the integration of two complementary\nmethodologies: a technique for object localization and a separate approach for\nprecise control through visual feedback, leveraging their strengths to address\nthe challenges posed by the industrial context and thereby improving overall\ngrasping accuracy. Our method employs feedback from perception sensors to adjust\nthe control loop efficiently, enabling the robotic system to adeptly pick and\nplace objects. We have introduced a controller capable of seamlessly managing\nthe detection and manipulation of various shapes and types of objects within an\nindustrial context, addressing numerous challenges that arise in such\nenvironments.\n","authors":["Khairidine Benali"],"pdf_url":"https://arxiv.org/pdf/2501.14557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14526v1","updated":"2025-01-24T14:29:58Z","published":"2025-01-24T14:29:58Z","title":"Robustified Time-optimal Point-to-point Motion Planning and Control\n under Uncertainty","summary":" This paper proposes a novel approach to formulate time-optimal point-to-point\nmotion planning and control under uncertainty. The approach defines a\nrobustified two-stage Optimal Control Problem (OCP), in which stage 1, with a\nfixed time grid, is seamlessly stitched with stage 2, which features a variable\ntime grid. Stage 1 optimizes not only the nominal trajectory, but also feedback\ngains and corresponding state covariances, which robustify constraints in both\nstages. The outcome is a minimized uncertainty in stage 1 and a minimized total\nmotion time for stage 2, both contributing to the time optimality and safety of\nthe total motion. A timely replanning strategy is employed to handle changes in\nconstraints and maintain feasibility, while a tailored iterative algorithm is\nproposed for efficient, real-time OCP execution.\n","authors":["Shuhao Zhang","Jan Swevers"],"pdf_url":"https://arxiv.org/pdf/2501.14526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14513v1","updated":"2025-01-24T14:18:22Z","published":"2025-01-24T14:18:22Z","title":"ABPT: Amended Backpropagation through Time with Partially Differentiable\n Rewards","summary":" Using the exact gradients of the rewards to directly optimize policy\nparameters via backpropagation-through-time (BPTT) enables high training\nperformance for quadrotor tasks. However, designing a fully differentiable\nreward architecture is often challenging. Partially differentiable rewards will\nresult in biased gradient propagation that degrades training performance. To\novercome this limitation, we propose Amended Backpropagation-through-Time\n(ABPT), a novel approach that mitigates gradient bias while preserving the\ntraining efficiency of BPTT. ABPT combines 0-step and N-step returns,\neffectively reducing the bias by leveraging value gradients from the learned\nQ-value function. Additionally, it adopts entropy regularization and state\ninitialization mechanisms to encourage exploration during training. We evaluate\nABPT on four representative quadrotor flight tasks. 
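A minimal sketch of the return-mixing idea ABPT describes above: blend a 0-step return taken from the learned Q-function with an N-step return that backpropagates through the differentiable rollout and bootstraps from Q at the final step. The mixing weight `alpha`, the tensor shapes, and the `mixed_return` helper are assumptions for illustration, not the authors' implementation.

```python
# Sketch only: blending a 0-step return with an N-step differentiable return
# that bootstraps from a learned Q-value, as a BPTT training objective.
import torch

def mixed_return(rewards, q0, qN, gamma=0.99, alpha=0.5):
    """
    rewards: (N, B) differentiable per-step rewards along a BPTT rollout
    q0:      (B,)   learned Q-value at the rollout's first state-action
    qN:      (B,)   learned Q-value at the rollout's final state-action
    """
    N = rewards.shape[0]
    discounts = gamma ** torch.arange(N, dtype=rewards.dtype).unsqueeze(1)
    n_step = (discounts * rewards).sum(0) + (gamma ** N) * qN  # N-step return
    zero_step = q0                                             # 0-step return
    return alpha * n_step + (1.0 - alpha) * zero_step

# Toy usage with random tensors standing in for a differentiable rollout.
torch.manual_seed(0)
rewards = torch.randn(10, 4, requires_grad=True)
q0, qN = torch.randn(4), torch.randn(4)
loss = -mixed_return(rewards, q0, qN).mean()   # ascend the blended return
loss.backward()
print(rewards.grad.shape)
```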
Experimental results\ndemonstrate that ABPT converges significantly faster and achieves higher\nultimate rewards than existing learning algorithms, particularly in tasks\ninvolving partially differentiable rewards.\n","authors":["Fanxing Li","Fangyu Sun","Tianbao Zhang","Danping Zou"],"pdf_url":"https://arxiv.org/pdf/2501.14513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14503v1","updated":"2025-01-24T14:01:53Z","published":"2025-01-24T14:01:53Z","title":"Benchmarking global optimization techniques for unmanned aerial vehicle\n path planning","summary":" The Unmanned Aerial Vehicle (UAV) path planning problem is a complex\noptimization problem in the field of robotics. In this paper, we investigate\nthe possible utilization of this problem in benchmarking global optimization\nmethods. We devise a problem instance generator and pick 56 representative\ninstances, which we compare to established benchmarking suits through\nExploratory Landscape Analysis to show their uniqueness. For the computational\ncomparison, we select twelve well-performing global optimization techniques\nfrom both subfields of stochastic algorithms (evolutionary computation methods)\nand deterministic algorithms (Dividing RECTangles, or DIRECT-type methods). The\nexperiments were conducted in settings with varying dimensionality and\ncomputational budgets. The results were analyzed through several criteria\n(number of best-found solutions, mean relative error, Friedman ranks) and\nutilized established statistical tests. The best-ranking methods for the UAV\nproblems were almost universally the top-performing evolutionary techniques\nfrom recent competitions on numerical optimization at the Institute of\nElectrical and Electronics Engineers Congress on Evolutionary Computation.\nLastly, we discussed the variable dimension characteristics of the studied UAV\nproblems that remain still largely under-investigated.\n","authors":["Mhd Ali Shehadeh","Jakub Kudela"],"pdf_url":"https://arxiv.org/pdf/2501.14503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14502v1","updated":"2025-01-24T14:01:51Z","published":"2025-01-24T14:01:51Z","title":"LiDAR-Based Vehicle Detection and Tracking for Autonomous Racing","summary":" Autonomous racing provides a controlled environment for testing the software\nand hardware of autonomous vehicles operating at their performance limits.\nCompetitive interactions between multiple autonomous racecars however introduce\nchallenging and potentially dangerous scenarios. Accurate and consistent\nvehicle detection and tracking is crucial for overtaking maneuvers, and\nlow-latency sensor processing is essential to respond quickly to hazardous\nsituations. This paper presents the LiDAR-based perception algorithms deployed\non Team PoliMOVE's autonomous racecar, which won multiple competitions in the\nIndy Autonomous Challenge series. Our Vehicle Detection and Tracking pipeline\nis composed of a novel fast Point Cloud Segmentation technique and a specific\nVehicle Pose Estimation methodology, together with a variable-step Multi-Target\nTracking algorithm. 
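A toy sketch of the kind of segmentation front-end named above: crude ground removal by height, then Euclidean clustering of the remaining LiDAR returns into object candidates with centroid pose estimates. The thresholds and the use of scikit-learn's DBSCAN are illustrative assumptions, not Team PoliMOVE's actual pipeline.

```python
# Toy LiDAR front-end: drop near-ground returns, then cluster the rest in the
# xy-plane and summarize each cluster with a centroid "pose" estimate.
import numpy as np
from sklearn.cluster import DBSCAN

def segment_objects(points, ground_z=-1.5, z_tol=0.2, eps=0.8, min_points=10):
    """points: (N, 3) array of x, y, z coordinates in the ego frame."""
    non_ground = points[points[:, 2] > ground_z + z_tol]   # remove ground returns
    labels = DBSCAN(eps=eps, min_samples=min_points).fit_predict(non_ground[:, :2])
    clusters = [non_ground[labels == k] for k in set(labels) if k != -1]
    centroids = [c.mean(axis=0) for c in clusters]          # naive pose estimate
    return clusters, centroids

# Toy usage: a flat ground plane plus one dense blob standing in for a vehicle.
rng = np.random.default_rng(1)
ground = np.c_[rng.uniform(-20, 20, (500, 2)), rng.normal(-1.5, 0.05, 500)]
car = rng.normal([5.0, 2.0, -0.5], 0.3, size=(200, 3))
clusters, centroids = segment_objects(np.vstack([ground, car]))
print(len(clusters), centroids[:1])
```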
Experimental results demonstrate the algorithm's\nperformance, robustness, computational efficiency, and suitability for\nautonomous racing applications, enabling fully autonomous overtaking maneuvers\nat velocities exceeding 275 km/h.\n","authors":["Marcello Cellina","Matteo Corno","Sergio Matteo Savaresi"],"pdf_url":"https://arxiv.org/pdf/2501.14502v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2501.14486v1","updated":"2025-01-24T13:40:33Z","published":"2025-01-24T13:40:33Z","title":"Visual-Lidar Map Alignment for Infrastructure Inspections","summary":" Routine and repetitive infrastructure inspections present safety, efficiency,\nand consistency challenges as they are performed manually, often in challenging\nor hazardous environments. They can also introduce subjectivity and errors into\nthe process, resulting in undesirable outcomes. Simultaneous localization and\nmapping (SLAM) presents an opportunity to generate high-quality 3D maps that\ncan be used to extract accurate and objective inspection data. Yet, many SLAM\nalgorithms are limited in their ability to align 3D maps from repeated\ninspections in GPS-denied settings automatically. This limitation hinders\npractical long-term asset health assessments by requiring tedious manual\nalignment for data association across scans from previous inspections. This\npaper introduces a versatile map alignment algorithm leveraging both visual and\nlidar data for improved place recognition robustness and presents an\ninfrastructure-focused dataset tailored for consecutive inspections. By\ndetaching map alignment from SLAM, our approach enhances infrastructure\ninspection pipelines, supports monitoring asset degradation over time, and\ninvigorates SLAM research by permitting exploration beyond existing\nmulti-session SLAM algorithms.\n","authors":["Jake McLaughlin","Nicholas Charron","Sriram Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2501.14486v1.pdf","comment":"8 pages, 8 figures, for associated code see\n https://github.com/jakemclaughlin6/vlma"},{"id":"http://arxiv.org/abs/2501.06605v3","updated":"2025-01-24T13:29:33Z","published":"2025-01-11T18:11:07Z","title":"RoboHorizon: An LLM-Assisted Multi-View World Model for Long-Horizon\n Robotic Manipulation","summary":" Efficient control in long-horizon robotic manipulation is challenging due to\ncomplex representation and policy learning requirements. Model-based visual\nreinforcement learning (RL) has shown great potential in addressing these\nchallenges but still faces notable limitations, particularly in handling sparse\nrewards and complex visual features in long-horizon environments. To address\nthese limitations, we propose the Recognize-Sense-Plan-Act (RSPA) pipeline for\nlong-horizon tasks and further introduce RoboHorizon, an LLM-assisted\nmulti-view world model tailored for long-horizon robotic manipulation. In\nRoboHorizon, pre-trained LLMs generate dense reward structures for multi-stage\nsub-tasks based on task language instructions, enabling robots to better\nrecognize long-horizon tasks. Keyframe discovery is then integrated into the\nmulti-view masked autoencoder (MAE) architecture to enhance the robot's ability\nto sense critical task sequences, strengthening its multi-stage perception of\nlong-horizon processes. 
Leveraging these dense rewards and multi-view\nrepresentations, a robotic world model is constructed to efficiently plan\nlong-horizon tasks, enabling the robot to reliably act through RL algorithms.\nExperiments on two representative benchmarks, RLBench and FurnitureBench, show\nthat RoboHorizon outperforms state-of-the-art visual model-based RL methods,\nachieving a 23.35% improvement in task success rates on RLBench's 4\nshort-horizon tasks and a 29.23% improvement on 6 long-horizon tasks from\nRLBench and 3 furniture assembly tasks from FurnitureBench.\n","authors":["Zixuan Chen","Jing Huo","Yangtao Chen","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2501.06605v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2501.14451v1","updated":"2025-01-24T12:34:04Z","published":"2025-01-24T12:34:04Z","title":"MARL-OT: Multi-Agent Reinforcement Learning Guided Online Fuzzing to\n Detect Safety Violation in Autonomous Driving Systems","summary":" Autonomous Driving Systems (ADSs) are safety-critical, as real-world safety\nviolations can result in significant losses. Rigorous testing is essential\nbefore deployment, with simulation testing playing a key role. However, ADSs\nare typically complex, consisting of multiple modules such as perception and\nplanning, or well-trained end-to-end autonomous driving systems. Offline\nmethods, such as the Genetic Algorithm (GA), can only generate predefined\ntrajectories for dynamics, which struggle to cause safety violations for ADSs\nrapidly and efficiently in different scenarios due to their evolutionary\nnature. Online methods, such as single-agent reinforcement learning (RL), can\nquickly adjust the dynamics' trajectory online to adapt to different scenarios,\nbut they struggle to capture complex corner cases of ADS arising from the\nintricate interplay among multiple vehicles. Multi-agent reinforcement learning\n(MARL) has a strong ability in cooperative tasks. On the other hand, it faces\nits own challenges, particularly with convergence. This paper introduces\nMARL-OT, a scalable framework that leverages MARL to detect safety violations\nof ADS resulting from surrounding vehicles' cooperation. MARL-OT employs MARL\nfor high-level guidance, triggering various dangerous scenarios for the\nrule-based online fuzzer to explore potential safety violations of ADS, thereby\ngenerating dynamic, realistic safety violation scenarios. Our approach improves\nthe detected safety violation rate by up to 136.2% compared to the\nstate-of-the-art (SOTA) testing technique.\n","authors":["Linfeng Liang","Xi Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.14451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14443v1","updated":"2025-01-24T12:23:12Z","published":"2025-01-24T12:23:12Z","title":"Learning more with the same effort: how randomization improves the\n robustness of a robotic deep reinforcement learning agent","summary":" The industrial application of Deep Reinforcement Learning (DRL) is frequently\nslowed down because of the inability to generate the experience required to\ntrain the models. Collecting data often involves considerable time and economic\neffort that is unaffordable in most cases. Fortunately, devices like robots can\nbe trained with synthetic experience thanks to virtual environments. 
With this\napproach, the sample efficiency problems of artificial agents are mitigated,\nbut another issue arises: the need for efficiently transferring the synthetic\nexperience into the real world (sim-to-real).\n This paper analyzes the robustness of a state-of-the-art sim-to-real\ntechnique known as progressive neural networks (PNNs) and studies how adding\ndiversity to the synthetic experience can complement it. To better understand\nthe drivers that lead to a lack of robustness, the robotic agent is still\ntested in a virtual environment to ensure total control on the divergence\nbetween the simulated and real models.\n The results show that a PNN-like agent exhibits a substantial decrease in its\nrobustness at the beginning of the real training phase. Randomizing certain\nvariables during simulation-based training significantly mitigates this issue.\nOn average, the increase in the model's accuracy is around 25% when diversity\nis introduced in the training process. This improvement can be translated into\na decrease in the required real experience for the same final robustness\nperformance. Notwithstanding, adding real experience to agents should still be\nbeneficial regardless of the quality of the virtual experience fed into the\nagent.\n","authors":["Lucía Güitta-López","Jaime Boal","Álvaro J. López-López"],"pdf_url":"https://arxiv.org/pdf/2501.14443v1.pdf","comment":"This article was accepted and published in Applied Intelligence\n (10.1007/s10489-022-04227-3)"},{"id":"http://arxiv.org/abs/2501.14400v1","updated":"2025-01-24T11:11:53Z","published":"2025-01-24T11:11:53Z","title":"SKIL: Semantic Keypoint Imitation Learning for Generalizable\n Data-efficient Manipulation","summary":" Real-world tasks such as garment manipulation and table rearrangement demand\nrobots to perform generalizable, highly precise, and long-horizon actions.\nAlthough imitation learning has proven to be an effective approach for teaching\nrobots new skills, large amounts of expert demonstration data are still\nindispensable for these complex tasks, resulting in high sample complexity and\ncostly data collection. To address this, we propose Semantic Keypoint Imitation\nLearning (SKIL), a framework which automatically obtains semantic keypoints with the\nhelp of vision foundation models and forms the descriptor of semantic\nkeypoints that enables efficient imitation learning of complex robotic tasks\nwith significantly lower sample complexity. In real-world experiments, SKIL\ndoubles the performance of baseline methods in tasks such as picking a cup or\nmouse, while demonstrating exceptional robustness to variations in objects,\nenvironmental changes, and distractors. For long-horizon tasks like hanging a\ntowel on a rack where previous methods fail completely, SKIL achieves a mean\nsuccess rate of 70\\% with as few as 30 demonstrations. Furthermore, SKIL\nnaturally supports cross-embodiment learning due to its semantic keypoints\nabstraction; our experiments demonstrate that even human videos bring\nconsiderable improvement to the learning performance. All these results\ndemonstrate the great success of SKIL in achieving data-efficient generalizable\nrobotic learning. 
Visualizations and code are available at:\nhttps://skil-robotics.github.io/SKIL-robotics/.\n","authors":["Shengjie Wang","Jiacheng You","Yihang Hu","Jiongye Li","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2501.14400v1.pdf","comment":"22 pages, 22 figures"},{"id":"http://arxiv.org/abs/2410.23085v3","updated":"2025-01-24T10:46:53Z","published":"2024-10-30T15:00:06Z","title":"S3PT: Scene Semantics and Structure Guided Clustering to Boost\n Self-Supervised Pre-Training for Autonomous Driving","summary":" Recent self-supervised clustering-based pre-training techniques like DINO and\nCribo have shown impressive results for downstream detection and segmentation\ntasks. However, real-world applications such as autonomous driving face\nchallenges with imbalanced object class and size distributions and complex\nscene geometries. In this paper, we propose S3PT a novel scene semantics and\nstructure guided clustering to provide more scene-consistent objectives for\nself-supervised training. Specifically, our contributions are threefold: First,\nwe incorporate semantic distribution consistent clustering to encourage better\nrepresentation of rare classes such as motorcycles or animals. Second, we\nintroduce object diversity consistent spatial clustering, to handle imbalanced\nand diverse object sizes, ranging from large background areas to small objects\nsuch as pedestrians and traffic signs. Third, we propose a depth-guided spatial\nclustering to regularize learning based on geometric information of the scene,\nthus further refining region separation on the feature level. Our learned\nrepresentations significantly improve performance in downstream semantic\nsegmentation and 3D object detection tasks on the nuScenes, nuImages, and\nCityscapes datasets and show promising domain translation properties.\n","authors":["Maciej K. Wozniak","Hariprasath Govindarajan","Marvin Klingner","Camille Maurice","B Ravi Kiran","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2410.23085v3.pdf","comment":"Accepted for WACV 2025 (Oral)"},{"id":"http://arxiv.org/abs/2501.14377v1","updated":"2025-01-24T10:24:39Z","published":"2025-01-24T10:24:39Z","title":"Dream to Fly: Model-Based Reinforcement Learning for Vision-Based Drone\n Flight","summary":" Autonomous drone racing has risen as a challenging robotic benchmark for\ntesting the limits of learning, perception, planning, and control. Expert human\npilots are able to agilely fly a drone through a race track by mapping the\nreal-time feed from a single onboard camera directly to control commands.\nRecent works in autonomous drone racing attempting direct pixel-to-commands\ncontrol policies (without explicit state estimation) have relied on either\nintermediate representations that simplify the observation space or performed\nextensive bootstrapping using Imitation Learning (IL). This paper introduces an\napproach that learns policies from scratch, allowing a quadrotor to\nautonomously navigate a race track by directly mapping raw onboard camera\npixels to control commands, just as human pilots do. By leveraging model-based\nreinforcement learning~(RL) - specifically DreamerV3 - we train visuomotor\npolicies capable of agile flight through a race track using only raw pixel\nobservations. 
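A schematic of the pixel-to-command mapping described above: raw camera frames in, a small number of normalized commands out. The CNN architecture, the 64x64 input size, and the 4-dimensional output (e.g., collective thrust plus body rates) are assumptions for the sketch; the paper itself trains a DreamerV3 world-model agent rather than this direct feed-forward policy.

```python
# Schematic visuomotor policy: raw pixels in, 4-D normalized command out.
import torch
import torch.nn as nn

class PixelPolicy(nn.Module):
    def __init__(self, n_commands=4):
        super().__init__()
        self.encoder = nn.Sequential(                 # expects (B, 3, 64, 64) RGB frames
            nn.Conv2d(3, 32, 4, stride=2), nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 128, 4, stride=2), nn.ReLU(),
            nn.Flatten(),
        )
        self.head = nn.Sequential(
            nn.Linear(128 * 6 * 6, 256), nn.ReLU(),
            nn.Linear(256, n_commands), nn.Tanh(),    # commands normalized to [-1, 1]
        )

    def forward(self, frames):
        return self.head(self.encoder(frames))

policy = PixelPolicy()
commands = policy(torch.rand(2, 3, 64, 64))
print(commands.shape)                                 # torch.Size([2, 4])
```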
While model-free RL methods such as PPO struggle to learn under\nthese conditions, DreamerV3 efficiently acquires complex visuomotor behaviors.\nMoreover, because our policies learn directly from pixel inputs, the\nperception-aware reward term employed in previous RL approaches to guide the\ntraining process is no longer needed. Our experiments demonstrate in both\nsimulation and real-world flight how the proposed approach can be deployed on\nagile quadrotors. This approach advances the frontier of vision-based\nautonomous flight and shows that model-based RL is a promising direction for\nreal-world robotics.\n","authors":["Angel Romero","Ashwin Shenai","Ismail Geles","Elie Aljalbout","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2501.14377v1.pdf","comment":"11 pages, 7 Figures"},{"id":"http://arxiv.org/abs/2501.14319v1","updated":"2025-01-24T08:25:48Z","published":"2025-01-24T08:25:48Z","title":"Scalable Benchmarking and Robust Learning for Noise-Free Ego-Motion and\n 3D Reconstruction from Noisy Video","summary":" We aim to redefine robust ego-motion estimation and photorealistic 3D\nreconstruction by addressing a critical limitation: the reliance on noise-free\ndata in existing models. While such sanitized conditions simplify evaluation,\nthey fail to capture the unpredictable, noisy complexities of real-world\nenvironments. Dynamic motion, sensor imperfections, and synchronization\nperturbations lead to sharp performance declines when these models are deployed\nin practice, revealing an urgent need for frameworks that embrace and excel\nunder real-world noise. To bridge this gap, we tackle three core challenges:\nscalable data generation, comprehensive benchmarking, and model robustness\nenhancement. First, we introduce a scalable noisy data synthesis pipeline that\ngenerates diverse datasets simulating complex motion, sensor imperfections, and\nsynchronization errors. Second, we leverage this pipeline to create\nRobust-Ego3D, a benchmark rigorously designed to expose noise-induced\nperformance degradation, highlighting the limitations of current learning-based\nmethods in ego-motion accuracy and 3D reconstruction quality. Third, we propose\nCorrespondence-guided Gaussian Splatting (CorrGS), a novel test-time adaptation\nmethod that progressively refines an internal clean 3D representation by\naligning noisy observations with rendered RGB-D frames from clean 3D map,\nenhancing geometric alignment and appearance restoration through visual\ncorrespondence. Extensive experiments on synthetic and real-world data\ndemonstrate that CorrGS consistently outperforms prior state-of-the-art\nmethods, particularly in scenarios involving rapid motion and dynamic\nillumination.\n","authors":["Xiaohao Xu","Tianyi Zhang","Shibo Zhao","Xiang Li","Sibo Wang","Yongqi Chen","Ye Li","Bhiksha Raj","Matthew Johnson-Roberson","Sebastian Scherer","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2501.14319v1.pdf","comment":"Accepted by ICLR 2025; 92 Pages; Project Repo:\n https://github.com/Xiaohao-Xu/SLAM-under-Perturbation. arXiv admin note:\n substantial text overlap with arXiv:2406.16850"},{"id":"http://arxiv.org/abs/2501.14280v1","updated":"2025-01-24T06:51:48Z","published":"2025-01-24T06:51:48Z","title":"Enhancing Robotic Precision in Construction: A Modular Factor\n Graph-Based Framework to Deflection and Backlash Compensation Using\n High-Accuracy Accelerometers","summary":" Accurate positioning is crucial in the construction industry, where labor\nshortages highlight the need for automation. 
Robotic systems with long\nkinematic chains are required to reach complex workspaces, including floors,\nwalls, and ceilings. These requirements significantly impact positioning\naccuracy due to effects such as deflection and backlash in various parts along\nthe kinematic chain. In this work, we introduce a novel approach that\nintegrates deflection and backlash compensation models with high-accuracy\naccelerometers, significantly enhancing position accuracy. Our method employs a\nmodular framework based on a factor graph formulation to estimate the state of\nthe kinematic chain, leveraging acceleration measurements to inform the model.\nExtensive testing on publicly released datasets, reflecting real-world\nconstruction disturbances, demonstrates the advantages of our approach. The\nproposed method reduces the $95\\%$ error threshold in the xy-plane by $50\\%$\ncompared to the state-of-the-art Virtual Joint Method, and by $31\\%$ when\nincorporating base tilt compensation.\n","authors":["Julien Kindle","Michael Loetscher","Andrea Alessandretti","Cesar Cadena","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2501.14280v1.pdf","comment":"8 pages, 7 figures, Accepted on November 2024 at IEEE Robotics and\n Automation Letters"},{"id":"http://arxiv.org/abs/2501.14238v1","updated":"2025-01-24T04:50:16Z","published":"2025-01-24T04:50:16Z","title":"Point-LN: A Lightweight Framework for Efficient Point Cloud\n Classification Using Non-Parametric Positional Encoding","summary":" We introduce Point-LN, a novel lightweight framework engineered for efficient\n3D point cloud classification. Point-LN integrates essential non-parametric\ncomponents-such as Farthest Point Sampling (FPS), k-Nearest Neighbors (k-NN),\nand non-learnable positional encoding-with a streamlined learnable classifier\nthat significantly enhances classification accuracy while maintaining a minimal\nparameter footprint. This hybrid architecture ensures low computational costs\nand rapid inference speeds, making Point-LN ideal for real-time and\nresource-constrained applications. Comprehensive evaluations on benchmark\ndatasets, including ModelNet40 and ScanObjectNN, demonstrate that Point-LN\nachieves competitive performance compared to state-of-the-art methods, all\nwhile offering exceptional efficiency. These results establish Point-LN as a\nrobust and scalable solution for diverse point cloud classification tasks,\nhighlighting its potential for widespread adoption in various computer vision\napplications.\n","authors":["Marzieh Mohammadi","Amir Salarpour","Pedram MohajerAnsari"],"pdf_url":"https://arxiv.org/pdf/2501.14238v1.pdf","comment":"This paper has been accepted for presentation at the 29th\n International Computer Conference, Computer Society of Iran (CSICC) 2025"},{"id":"http://arxiv.org/abs/2501.14208v1","updated":"2025-01-24T03:26:41Z","published":"2025-01-24T03:26:41Z","title":"You Only Teach Once: Learn One-Shot Bimanual Robotic Manipulation from\n Video Demonstrations","summary":" Bimanual robotic manipulation is a long-standing challenge of embodied\nintelligence due to its characteristics of dual-arm spatial-temporal\ncoordination and high-dimensional action spaces. Previous studies rely on\npre-defined action taxonomies or direct teleoperation to alleviate or\ncircumvent these issues, often making them lack simplicity, versatility and\nscalability. 
Differently, we believe that the most effective and efficient way\nfor teaching bimanual manipulation is learning from human demonstrated videos,\nwhere rich features such as spatial-temporal positions, dynamic postures,\ninteraction states and dexterous transitions are available almost for free. In\nthis work, we propose the YOTO (You Only Teach Once), which can extract and\nthen inject patterns of bimanual actions from as few as a single binocular\nobservation of hand movements, and teach dual robot arms various complex tasks.\nFurthermore, based on keyframes-based motion trajectories, we devise a subtle\nsolution for rapidly generating training demonstrations with diverse variations\nof manipulated objects and their locations. These data can then be used to\nlearn a customized bimanual diffusion policy (BiDP) across diverse scenes. In\nexperiments, YOTO achieves impressive performance in mimicking 5 intricate\nlong-horizon bimanual tasks, possesses strong generalization under different\nvisual and spatial conditions, and outperforms existing visuomotor imitation\nlearning methods in accuracy and efficiency. Our project link is\nhttps://hnuzhy.github.io/projects/YOTO.\n","authors":["Huayi Zhou","Ruixiang Wang","Yunxin Tai","Yueci Deng","Guiliang Liu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2501.14208v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2501.14151v1","updated":"2025-01-24T00:34:51Z","published":"2025-01-24T00:34:51Z","title":"RaccoonBot: An Autonomous Wire-Traversing Solar-Tracking Robot for\n Persistent Environmental Monitoring","summary":" Environmental monitoring is used to characterize the health and relationship\nbetween organisms and their environments. In forest ecosystems, robots can\nserve as platforms to acquire such data, even in hard-to-reach places where\nwire-traversing platforms are particularly promising due to their efficient\ndisplacement. This paper presents the RaccoonBot, which is a novel autonomous\nwire-traversing robot for persistent environmental monitoring, featuring a\nfail-safe mechanical design with a self-locking mechanism in case of electrical\nshortage. The robot also features energy-aware mobility through a novel Solar\ntracking algorithm, that allows the robot to find a position on the wire to\nhave direct contact with solar power to increase the energy harvested.\nExperimental results validate the electro-mechanical features of the\nRaccoonBot, showing that it is able to handle wire perturbations, different\ninclinations, and achieving energy autonomy.\n","authors":["Efrain Mendez-Flores","Agaton Pourshahidi","Magnus Egerstedt"],"pdf_url":"https://arxiv.org/pdf/2501.14151v1.pdf","comment":"Pre-print submitted to the 2025 IEEE International Conference on\n Robotics & Automation (ICRA 2025)"},{"id":"http://arxiv.org/abs/2501.14147v1","updated":"2025-01-24T00:21:10Z","published":"2025-01-24T00:21:10Z","title":"HAMMER: Heterogeneous, Multi-Robot Semantic Gaussian Splatting","summary":" 3D Gaussian Splatting offers expressive scene reconstruction, modeling a\nbroad range of visual, geometric, and semantic information. However, efficient\nreal-time map reconstruction with data streamed from multiple robots and\ndevices remains a challenge. 
To that end, we propose HAMMER, a server-based\ncollaborative Gaussian Splatting method that leverages widely available ROS\ncommunication infrastructure to generate 3D, metric-semantic maps from\nasynchronous robot data-streams with no prior knowledge of initial robot\npositions and varying on-device pose estimators. HAMMER consists of (i) a frame\nalignment module that transforms local SLAM poses and image data into a global\nframe and requires no prior relative pose knowledge, and (ii) an online module\nfor training semantic 3DGS maps from streaming data. HAMMER handles mixed\nperception modes, adjusts automatically for variations in image pre-processing\namong different devices, and distills CLIP semantic codes into the 3D scene for\nopen-vocabulary language queries. In our real-world experiments, HAMMER creates\nhigher-fidelity maps (2x) compared to competing baselines and is useful for\ndownstream tasks, such as semantic goal-conditioned navigation (e.g., ``go to\nthe couch\"). Accompanying content available at hammer-project.github.io.\n","authors":["Javier Yu","Timothy Chen","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2501.14147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14942v1","updated":"2025-01-24T22:01:23Z","published":"2025-01-24T22:01:23Z","title":"Force-Based Robotic Imitation Learning: A Two-Phase Approach for\n Construction Assembly Tasks","summary":" The drive for efficiency and safety in construction has boosted the role of\nrobotics and automation. However, complex tasks like welding and pipe insertion\npose challenges due to their need for precise adaptive force control, which\ncomplicates robotic training. This paper proposes a two-phase system to improve\nrobot learning, integrating human-derived force feedback. The first phase\ncaptures real-time data from operators using a robot arm linked with a virtual\nsimulator via ROS-Sharp. In the second phase, this feedback is converted into\nrobotic motion instructions, using a generative approach to incorporate force\nfeedback into the learning process. This method's effectiveness is demonstrated\nthrough improved task completion times and success rates. The framework\nsimulates realistic force-based interactions, enhancing the training data's\nquality for precise robotic manipulation in construction tasks.\n","authors":["Hengxu You","Yang Ye","Tianyu Zhou","Jing Du"],"pdf_url":"https://arxiv.org/pdf/2501.14942v1.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2501.14934v1","updated":"2025-01-24T21:47:38Z","published":"2025-01-24T21:47:38Z","title":"Temporal Binding Foundation Model for Material Property Recognition via\n Tactile Sequence Perception","summary":" Robots engaged in complex manipulation tasks require robust material property\nrecognition to ensure adaptability and precision. Traditionally, visual data\nhas been the primary source for object perception; however, it often proves\ninsufficient in scenarios where visibility is obstructed or detailed\nobservation is needed. This gap highlights the necessity of tactile sensing as\na complementary or primary input for material recognition. Tactile data becomes\nparticularly essential in contact-rich, small-scale manipulations where subtle\ndeformations and surface interactions cannot be accurately captured by vision\nalone. This letter presents a novel approach leveraging a temporal binding\nfoundation model for tactile sequence understanding to enhance material\nproperty recognition. 
By processing tactile sensor data with a temporal focus,\nthe proposed system captures the sequential nature of tactile interactions,\nsimilar to human fingertip perception. Additionally, this letter demonstrates\nthat, through tailored and specific design, the foundation model can more\neffectively capture temporal information embedded in tactile sequences,\nadvancing material property understanding. Experimental results validate the\nmodel's capability to capture these temporal patterns, confirming its utility\nfor material property recognition in visually restricted scenarios. This work\nunderscores the necessity of embedding advanced tactile data processing\nframeworks within robotic systems to achieve truly embodied and responsive\nmanipulation capabilities.\n","authors":["Hengxu You","Tianyu Zhou","Jing Du"],"pdf_url":"https://arxiv.org/pdf/2501.14934v1.pdf","comment":"4 pages,"},{"id":"http://arxiv.org/abs/2308.10966v6","updated":"2025-01-24T21:32:20Z","published":"2023-08-21T18:23:53Z","title":"Deadlock-free, Safe, and Decentralized Multi-Robot Navigation in Social\n Mini-Games via Discrete-Time Control Barrier Functions","summary":" We present an approach to ensure safe and deadlock-free navigation for\ndecentralized multi-robot systems operating in constrained environments,\nincluding doorways and intersections. Although many solutions have been\nproposed that ensure safety and resolve deadlocks, optimally preventing\ndeadlocks in a minimally invasive and decentralized fashion remains an open\nproblem. We first formalize the objective as a non-cooperative,\nnon-communicative, partially observable multi-robot navigation problem in\nconstrained spaces with multiple conflicting agents, which we term as social\nmini-games. Formally, we solve a discrete-time optimal receding horizon control\nproblem leveraging control barrier functions for safe long-horizon planning.\nOur approach to ensuring liveness rests on the insight that \\textit{there\nexists barrier certificates that allow each robot to preemptively perturb their\nstate in a minimally-invasive fashion onto liveness sets i.e. states where\nrobots are deadlock-free}. We evaluate our approach in simulation as well on\nphysical robots using F$1/10$ robots, a Clearpath Jackal, as well as a Boston\nDynamics Spot in a doorway, hallway, and corridor intersection scenario.\nCompared to both fully decentralized and centralized approaches with and\nwithout deadlock resolution capabilities, we demonstrate that our approach\nresults in safer, more efficient, and smoother navigation, based on a\ncomprehensive set of metrics including success rate, collision rate, stop time,\nchange in velocity, path deviation, time-to-goal, and flow rate.\n","authors":["Rohan Chandra","Vrushabh Zinage","Efstathios Bakolas","Peter Stone","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2308.10966v6.pdf","comment":"major update since last revision"},{"id":"http://arxiv.org/abs/2501.14856v1","updated":"2025-01-24T17:15:49Z","published":"2025-01-24T17:15:49Z","title":"Noise-conditioned Energy-based Annealed Rewards (NEAR): A Generative\n Framework for Imitation Learning from Observation","summary":" This paper introduces a new imitation learning framework based on\nenergy-based generative models capable of learning complex, physics-dependent,\nrobot motion policies through state-only expert motion trajectories. 
Our\nalgorithm, called Noise-conditioned Energy-based Annealed Rewards (NEAR),\nconstructs several perturbed versions of the expert's motion data distribution\nand learns smooth, and well-defined representations of the data distribution's\nenergy function using denoising score matching. We propose to use these learnt\nenergy functions as reward functions to learn imitation policies via\nreinforcement learning. We also present a strategy to gradually switch between\nthe learnt energy functions, ensuring that the learnt rewards are always\nwell-defined in the manifold of policy-generated samples. We evaluate our\nalgorithm on complex humanoid tasks such as locomotion and martial arts and\ncompare it with state-only adversarial imitation learning algorithms like\nAdversarial Motion Priors (AMP). Our framework sidesteps the optimisation\nchallenges of adversarial imitation learning techniques and produces results\ncomparable to AMP in several quantitative metrics across multiple imitation\nsettings.\n","authors":["Anish Abhijit Diwan","Julen Urain","Jens Kober","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2501.14856v1.pdf","comment":"Accepted as a conference paper at the International Conference on\n Learning Representations (ICLR) 2025"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.14729v1","updated":"2025-01-24T18:59:51Z","published":"2025-01-24T18:59:51Z","title":"HERMES: A Unified Self-Driving World Model for Simultaneous 3D Scene\n Understanding and Generation","summary":" Driving World Models (DWMs) have become essential for autonomous driving by\nenabling future scene prediction. However, existing DWMs are limited to scene\ngeneration and fail to incorporate scene understanding, which involves\ninterpreting and reasoning about the driving environment. In this paper, we\npresent a unified Driving World Model named HERMES. We seamlessly integrate 3D\nscene understanding and future scene evolution (generation) through a unified\nframework in driving scenarios. Specifically, HERMES leverages a Bird's-Eye\nView (BEV) representation to consolidate multi-view spatial information while\npreserving geometric relationships and interactions. We also introduce world\nqueries, which incorporate world knowledge into BEV features via causal\nattention in the Large Language Model (LLM), enabling contextual enrichment for\nunderstanding and generation tasks. We conduct comprehensive studies on\nnuScenes and OmniDrive-nuScenes datasets to validate the effectiveness of our\nmethod. HERMES achieves state-of-the-art performance, reducing generation error\nby 32.4% and improving understanding metrics such as CIDEr by 8.0%. The model\nand code will be publicly released at https://github.com/LMD0311/HERMES.\n","authors":["Xin Zhou","Dingkang Liang","Sifan Tu","Xiwu Chen","Yikang Ding","Dingyuan Zhang","Feiyang Tan","Hengshuang Zhao","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2501.14729v1.pdf","comment":"Work in progress. 
The code will be available at\n https://github.com/LMD0311/HERMES"},{"id":"http://arxiv.org/abs/2501.14728v1","updated":"2025-01-24T18:59:31Z","published":"2025-01-24T18:59:31Z","title":"Mitigating GenAI-powered Evidence Pollution for Out-of-Context\n Multimodal Misinformation Detection","summary":" While large generative artificial intelligence (GenAI) models have achieved\nsignificant success, they also raise growing concerns about online information\nsecurity due to their potential misuse for generating deceptive content.\nOut-of-context (OOC) multimodal misinformation detection, which often retrieves\nWeb evidence to identify the repurposing of images in false contexts, faces the\nissue of reasoning over GenAI-polluted evidence to derive accurate predictions.\nExisting works simulate GenAI-powered pollution at the claim level with\nstylistic rewriting to conceal linguistic cues, and ignore evidence-level\npollution for such information-seeking applications. In this work, we\ninvestigate how polluted evidence affects the performance of existing OOC\ndetectors, revealing a performance degradation of more than 9 percentage\npoints. We propose two strategies, cross-modal evidence reranking and\ncross-modal claim-evidence reasoning, to address the challenges posed by\npolluted evidence. Extensive experiments on two benchmark datasets show that\nthese strategies can effectively enhance the robustness of existing\nout-of-context detectors amidst polluted evidence.\n","authors":["Zehong Yan","Peng Qi","Wynne Hsu","Mong Li Lee"],"pdf_url":"https://arxiv.org/pdf/2501.14728v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2501.14726v1","updated":"2025-01-24T18:59:15Z","published":"2025-01-24T18:59:15Z","title":"Relightable Full-Body Gaussian Codec Avatars","summary":" We propose Relightable Full-Body Gaussian Codec Avatars, a new approach for\nmodeling relightable full-body avatars with fine-grained details including face\nand hands. The unique challenge for relighting full-body avatars lies in the\nlarge deformations caused by body articulation and the resulting impact on\nappearance caused by light transport. Changes in body pose can dramatically\nchange the orientation of body surfaces with respect to lights, resulting in\nboth local appearance changes due to changes in local light transport\nfunctions, as well as non-local changes due to occlusion between body parts. To\naddress this, we decompose the light transport into local and non-local\neffects. Local appearance changes are modeled using learnable zonal harmonics\nfor diffuse radiance transfer. Unlike spherical harmonics, zonal harmonics are\nhighly efficient to rotate under articulation. This allows us to learn diffuse\nradiance transfer in a local coordinate frame, which disentangles the local\nradiance transfer from the articulation of the body. To account for non-local\nappearance changes, we introduce a shadow network that predicts shadows given\nprecomputed incoming irradiance on a base mesh. This facilitates the learning\nof non-local shadowing between the body parts. Finally, we use a deferred\nshading approach to model specular radiance transfer and better capture\nreflections and highlights such as eye glints. 
We demonstrate that our approach\nsuccessfully models both the local and non-local light transport required for\nrelightable full-body avatars, with a superior generalization ability under\nnovel illumination conditions and unseen poses.\n","authors":["Shaofei Wang","Tomas Simon","Igor Santesteban","Timur Bagautdinov","Junxuan Li","Vasu Agrawal","Fabian Prada","Shoou-I Yu","Pace Nalbone","Matt Gramlich","Roman Lubachersky","Chenglei Wu","Javier Romero","Jason Saragih","Michael Zollhoefer","Andreas Geiger","Siyu Tang","Shunsuke Saito"],"pdf_url":"https://arxiv.org/pdf/2501.14726v1.pdf","comment":"14 pages, 9 figures. Project page:\n https://neuralbodies.github.io/RFGCA"},{"id":"http://arxiv.org/abs/2501.14709v1","updated":"2025-01-24T18:32:34Z","published":"2025-01-24T18:32:34Z","title":"Enhanced Confocal Laser Scanning Microscopy with Adaptive Physics\n Informed Deep Autoencoders","summary":" We present a physics-informed deep learning framework to address common\nlimitations in Confocal Laser Scanning Microscopy (CLSM), such as diffraction\nlimited resolution, noise, and undersampling due to low laser power conditions.\nThe optical system's point spread function (PSF) and common CLSM image\ndegradation mechanisms namely photon shot noise, dark current noise, motion\nblur, speckle noise, and undersampling were modeled and were directly included\ninto model architecture. The model reconstructs high fidelity images from\nheavily noisy inputs by using convolutional and transposed convolutional\nlayers. Following the advances in compressed sensing, our approach\nsignificantly reduces data acquisition requirements without compromising image\nresolution. The proposed method was extensively evaluated on simulated CLSM\nimages of diverse structures, including lipid droplets, neuronal networks, and\nfibrillar systems. Comparisons with traditional deconvolution algorithms such\nas Richardson-Lucy (RL), non-negative least squares (NNLS), and other methods\nlike Total Variation (TV) regularization, Wiener filtering, and Wavelet\ndenoising demonstrate the superiority of the network in restoring fine\nstructural details with high fidelity. Assessment metrics like Structural\nSimilarity Index (SSIM) and Peak Signal to Noise Ratio (PSNR), underlines that\nthe AdaptivePhysicsAutoencoder achieved robust image enhancement across diverse\nCLSM conditions, helping faster acquisition, reduced photodamage, and reliable\nperformance in low light and sparse sampling scenarios holding promise for\napplications in live cell imaging, dynamic biological studies, and high\nthroughput material characterization.\n","authors":["Zaheer Ahmad","Junaid Shabeer","Usman Saleem","Tahir Qadeer","Abdul Sami","Zahira El Khalidi","Saad Mehmood"],"pdf_url":"https://arxiv.org/pdf/2501.14709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14704v1","updated":"2025-01-24T18:29:34Z","published":"2025-01-24T18:29:34Z","title":"Stroke classification using Virtual Hybrid Edge Detection from in silico\n electrical impedance tomography data","summary":" Electrical impedance tomography (EIT) is a non-invasive imaging method for\nrecovering the internal conductivity of a physical body from electric boundary\nmeasurements. EIT combined with machine learning has shown promise for the\nclassification of strokes. However, most previous works have used raw EIT\nvoltage data as network inputs. 
We build upon a recent development which\nsuggested the use of special noise-robust Virtual Hybrid Edge Detection (VHED)\nfunctions as network inputs, although that work used only highly simplified and\nmathematically ideal models. In this work we strengthen the case for the use of\nEIT, and VHED functions especially, for stroke classification. We design models\nwith high detail and mathematical realism to test the use of VHED functions as\ninputs. Virtual patients are created using a physically detailed 2D head model\nwhich includes features known to create challenges in real-world imaging\nscenarios. Conductivity values are drawn from statistically realistic\ndistributions, and phantoms are afflicted with either hemorrhagic or ischemic\nstrokes of various shapes and sizes. Simulated noisy EIT electrode data,\ngenerated using the realistic Complete Electrode Model (CEM) as opposed to the\nmathematically ideal continuum model, is processed to obtain VHED functions. We\ncompare the use of VHED functions as inputs against the alternative paradigm of\nusing raw EIT voltages. Our results show that (i) stroke classification can be\nperformed with high accuracy using 2D EIT data from physically detailed and\nmathematically realistic models, and (ii) in the presence of noise, VHED\nfunctions outperform raw data as network inputs.\n","authors":["Juan Pablo Agnelli","Fernando S. Moura","Siiri Rautio","Melody Alsaker","Rashmi Murthy","Matti Lassas","Samuli Siltanen"],"pdf_url":"https://arxiv.org/pdf/2501.14704v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.14689v1","updated":"2025-01-24T18:02:32Z","published":"2025-01-24T18:02:32Z","title":"Approach to Designing CV Systems for Medical Applications: Data,\n Architecture and AI","summary":" This paper introduces an innovative software system for fundus image analysis\nthat deliberately diverges from the conventional screening approach, opting not\nto predict specific diagnoses. Instead, our methodology mimics the diagnostic\nprocess by thoroughly analyzing both normal and pathological features of fundus\nstructures, leaving the ultimate decision-making authority in the hands of\nhealthcare professionals. Our initiative addresses the need for objective\nclinical analysis and seeks to automate and enhance the clinical workflow of\nfundus image examination. The system, from its overarching architecture to the\nmodular analysis design powered by artificial intelligence (AI) models, aligns\nseamlessly with ophthalmological practices. Our unique approach utilizes a\ncombination of state-of-the-art deep learning methods and traditional computer\nvision algorithms to provide a comprehensive and nuanced analysis of fundus\nstructures. We present a distinctive methodology for designing medical\napplications, using our system as an illustrative example. 
Comprehensive\nverification and validation results demonstrate the efficacy of our approach in\nrevolutionizing fundus image analysis, with potential applications across\nvarious medical domains.\n","authors":["Dmitry Ryabtsev","Boris Vasilyev","Sergey Shershakov"],"pdf_url":"https://arxiv.org/pdf/2501.14689v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.14685v1","updated":"2025-01-24T18:01:07Z","published":"2025-01-24T18:01:07Z","title":"Rethinking Foundation Models for Medical Image Classification through a\n Benchmark Study on MedMNIST","summary":" Foundation models are widely employed in medical image analysis, due to their\nhigh adaptability and generalizability for downstream tasks. With the\nincreasing number of foundation models being released, model selection has\nbecome an important issue. In this work, we study the capabilities of\nfoundation models in medical image classification tasks by conducting a\nbenchmark study on the MedMNIST dataset. Specifically, we adopt various\nfoundation models ranging from convolutional to Transformer-based models and\nimplement both end-to-end training and linear probing for all classification\ntasks. The results demonstrate the significant potential of these pre-trained\nmodels when transferred for medical image classification. We further conduct\nexperiments with different image sizes and various sizes of training data. By\nanalyzing all the results, we provide preliminary, yet useful insights and\nconclusions on this topic.\n","authors":["Fuping Wu","Bartlomiej W. Papiez"],"pdf_url":"https://arxiv.org/pdf/2501.14685v1.pdf","comment":"submitted to MIDL2025"},{"id":"http://arxiv.org/abs/2501.14679v1","updated":"2025-01-24T17:57:06Z","published":"2025-01-24T17:57:06Z","title":"Surface Vision Mamba: Leveraging Bidirectional State Space Model for\n Efficient Spherical Manifold Representation","summary":" Attention-based methods have demonstrated exceptional performance in\nmodelling long-range dependencies on spherical cortical surfaces, surpassing\ntraditional Geometric Deep Learning (GDL) models. However, their extensive\ninference time and high memory demands pose challenges for application to large\ndatasets with limited computing resources. Inspired by the state space model in\ncomputer vision, we introduce the attention-free Vision Mamba (Vim) to\nspherical surfaces, presenting a domain-agnostic architecture for analyzing\ndata on spherical manifolds. Our method achieves surface patching by\nrepresenting spherical data as a sequence of triangular patches derived from a\nsubdivided icosphere. The proposed Surface Vision Mamba (SiM) is evaluated on\nmultiple neurodevelopmental phenotype regression tasks using cortical surface\nmetrics from neonatal brains. Experimental results demonstrate that SiM\noutperforms both attention- and GDL-based methods, delivering 4.8 times faster\ninference and achieving 91.7% lower memory consumption compared to the Surface\nVision Transformer (SiT) under the Ico-4 grid partitioning. Sensitivity\nanalysis further underscores the potential of SiM to identify subtle cognitive\ndevelopmental patterns. 
The code is available at\nhttps://github.com/Rongzhao-He/surface-vision-mamba.\n","authors":["Rongzhao He","Weihao Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.14679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14677v1","updated":"2025-01-24T17:56:24Z","published":"2025-01-24T17:56:24Z","title":"MatAnyone: Stable Video Matting with Consistent Memory Propagation","summary":" Auxiliary-free human video matting methods, which rely solely on input\nframes, often struggle with complex or ambiguous backgrounds. To address this,\nwe propose MatAnyone, a robust framework tailored for target-assigned video\nmatting. Specifically, building on a memory-based paradigm, we introduce a\nconsistent memory propagation module via region-adaptive memory fusion, which\nadaptively integrates memory from the previous frame. This ensures semantic\nstability in core regions while preserving fine-grained details along object\nboundaries. For robust training, we present a larger, high-quality, and diverse\ndataset for video matting. Additionally, we incorporate a novel training\nstrategy that efficiently leverages large-scale segmentation data, boosting\nmatting stability. With this new network design, dataset, and training\nstrategy, MatAnyone delivers robust and accurate video matting results in\ndiverse real-world scenarios, outperforming existing methods.\n","authors":["Peiqing Yang","Shangchen Zhou","Jixin Zhao","Qingyi Tao","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2501.14677v1.pdf","comment":"Project page: https://pq-yang.github.io/projects/MatAnyone"},{"id":"http://arxiv.org/abs/2412.17640v2","updated":"2025-01-24T17:43:56Z","published":"2024-12-23T15:18:24Z","title":"Hierarchical Vector Quantization for Unsupervised Action Segmentation","summary":" In this work, we address unsupervised temporal action segmentation, which\nsegments a set of long, untrimmed videos into semantically meaningful segments\nthat are consistent across videos. While recent approaches combine\nrepresentation learning and clustering in a single step for this task, they do\nnot cope with large variations within temporal segments of the same class. To\naddress this limitation, we propose a novel method, termed Hierarchical Vector\nQuantization (HVQ), that consists of two subsequent vector quantization\nmodules. This results in a hierarchical clustering where the additional\nsubclusters cover the variations within a cluster. We demonstrate that our\napproach captures the distribution of segment lengths much better than the\nstate of the art. To this end, we introduce a new metric based on the\nJensen-Shannon Distance (JSD) for unsupervised temporal action segmentation. We\nevaluate our approach on three public datasets, namely Breakfast, YouTube\nInstructional and IKEA ASM. Our approach outperforms the state of the art in\nterms of F1 score, recall and JSD.\n","authors":["Federico Spurio","Emad Bahrami","Gianpiero Francesca","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2412.17640v2.pdf","comment":"To be published in Conference on Artificial Intelligence (AAAI) 2025"},{"id":"http://arxiv.org/abs/2501.14659v1","updated":"2025-01-24T17:29:17Z","published":"2025-01-24T17:29:17Z","title":"Towards Unified Structured Light Optimization","summary":" Structured light (SL) 3D reconstruction captures the precise surface shape of\nobjects, providing high-accuracy 3D data essential for industrial inspection\nand robotic vision systems. 
However, current research on optimizing projection\npatterns in SL 3D reconstruction faces two main limitations: each scene\nrequires separate training of calibration parameters, and optimization is\nrestricted to specific types of SL, which restricts their application range. To\ntackle these limitations, we present a unified framework for SL optimization,\nadaptable to diverse lighting conditions, object types, and different types of\nSL. Our framework quickly determines the optimal projection pattern using only\na single projected image. Key contributions include a novel global matching\nmethod for projectors, enabling precise projector-camera alignment with just\none projected image, and a new projection compensation model with a photometric\nadjustment module to reduce artifacts from out-of-gamut clipping. Experimental\nresults show our method achieves superior decoding accuracy across various\nobjects, SL patterns, and lighting conditions, significantly outperforming\nprevious methods.\n","authors":["Tinglei Wan","Tonghua Su","Zhongjie Wang"],"pdf_url":"https://arxiv.org/pdf/2501.14659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18592v2","updated":"2025-01-24T17:21:45Z","published":"2024-09-27T09:51:45Z","title":"From One to the Power of Many: Invariance to Multi-LiDAR Perception from\n Single-Sensor Datasets","summary":" Recently, LiDAR segmentation methods for autonomous vehicles, powered by deep\nneural networks, have experienced steep growth in performance on classic\nbenchmarks, such as nuScenes and SemanticKITTI. However, there are still large\ngaps in performance when deploying models trained on such single-sensor setups\nto modern vehicles with multiple high-resolution LiDAR sensors. In this work,\nwe introduce a new metric for feature-level invariance which can serve as a\nproxy to measure cross-domain generalization without requiring labeled data.\nAdditionally, we propose two application-specific data augmentations, which\nfacilitate better transfer to multi-sensor LiDAR setups, when trained on\nsingle-sensor datasets. We provide experimental evidence on both simulated and\nreal data, that our proposed augmentations improve invariance across LiDAR\nsetups, leading to improved generalization.\n","authors":["Marc Uecker","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2409.18592v2.pdf","comment":"Accepted for publication at the ML4AD Workshop @ AAAI Conference 2025"},{"id":"http://arxiv.org/abs/2501.14646v1","updated":"2025-01-24T17:14:25Z","published":"2025-01-24T17:14:25Z","title":"SyncAnimation: A Real-Time End-to-End Framework for Audio-Driven Human\n Pose and Talking Head Animation","summary":" Generating talking avatar driven by audio remains a significant challenge.\nExisting methods typically require high computational costs and often lack\nsufficient facial detail and realism, making them unsuitable for applications\nthat demand high real-time performance and visual quality. Additionally, while\nsome methods can synchronize lip movement, they still face issues with\nconsistency between facial expressions and upper body movement, particularly\nduring silent periods. In this paper, we introduce SyncAnimation, the first\nNeRF-based method that achieves audio-driven, stable, and real-time generation\nof speaking avatar by combining generalized audio-to-pose matching and\naudio-to-expression synchronization. 
By integrating AudioPose Syncer and\nAudioEmotion Syncer, SyncAnimation achieves high-precision poses and expression\ngeneration, progressively producing audio-synchronized upper body, head, and\nlip shapes. Furthermore, the High-Synchronization Human Renderer ensures\nseamless integration of the head and upper body, and achieves audio-sync lip.\nThe project page can be found at https://syncanimation.github.io\n","authors":["Yujian Liu","Shidang Xu","Jing Guo","Dingbin Wang","Zairan Wang","Xianfeng Tan","Xiaoli Liu"],"pdf_url":"https://arxiv.org/pdf/2501.14646v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.07613v3","updated":"2025-01-24T17:06:30Z","published":"2024-09-11T20:50:41Z","title":"Token Turing Machines are Efficient Vision Models","summary":" We propose Vision Token Turing Machines (ViTTM), an efficient, low-latency,\nmemory-augmented Vision Transformer (ViT). Our approach builds on Neural Turing\nMachines and Token Turing Machines, which were applied to NLP and sequential\nvisual understanding tasks. ViTTMs are designed for non-sequential computer\nvision tasks such as image classification and segmentation. Our model creates\ntwo sets of tokens: process tokens and memory tokens; process tokens pass\nthrough encoder blocks and read-write from memory tokens at each encoder block\nin the network, allowing them to store and retrieve information from memory. By\nensuring that there are fewer process tokens than memory tokens, we are able to\nreduce the inference time of the network while maintaining its accuracy. On\nImageNet-1K, the state-of-the-art ViT-B has median latency of 529.5ms and 81.0%\naccuracy, while our ViTTM-B is 56% faster (234.1ms), with 2.4 times fewer\nFLOPs, with an accuracy of 82.9%. On ADE20K semantic segmentation, ViT-B\nachieves 45.65mIoU at 13.8 frame-per-second (FPS) whereas our ViTTM-B model\nachieves a 45.17 mIoU with 26.8 FPS (+94%).\n","authors":["Purvish Jajal","Nick John Eliopoulos","Benjamin Shiue-Hal Chou","George K. Thiruvathukal","James C. Davis","Yung-Hsiang Lu"],"pdf_url":"https://arxiv.org/pdf/2409.07613v3.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2501.14607v1","updated":"2025-01-24T16:24:15Z","published":"2025-01-24T16:24:15Z","title":"ReferDINO: Referring Video Object Segmentation with Visual Grounding\n Foundations","summary":" Referring video object segmentation (RVOS) aims to segment target objects\nthroughout a video based on a text description. Despite notable progress in\nrecent years, current RVOS models still struggle to handle complicated object\ndescriptions due to their limited video-language understanding. To address this\nlimitation, we present \textbf{ReferDINO}, an end-to-end RVOS model that\ninherits strong vision-language understanding from the pretrained visual\ngrounding foundation models, and is further endowed with effective temporal\nunderstanding and object segmentation capabilities. In ReferDINO, we contribute\nthree technical innovations for effectively adapting the foundation models to\nRVOS: 1) an object-consistent temporal enhancer that capitalizes on the\npretrained object-text representations to enhance temporal understanding and\nobject consistency; 2) a grounding-guided deformable mask decoder that\nintegrates text and grounding conditions to generate accurate object masks; 3)\na confidence-aware query pruning strategy that significantly improves the\nobject decoding efficiency without compromising performance. 
We conduct\nextensive experiments on five public RVOS benchmarks to demonstrate that our\nproposed ReferDINO outperforms state-of-the-art methods significantly. Project\npage: \\url{https://isee-laboratory.github.io/ReferDINO}\n","authors":["Tianming Liang","Kun-Yu Lin","Chaolei Tan","Jianguo Zhang","Wei-Shi Zheng","Jian-Fang Hu"],"pdf_url":"https://arxiv.org/pdf/2501.14607v1.pdf","comment":"Project page: https://isee-laboratory.github.io/ReferDINO"},{"id":"http://arxiv.org/abs/2501.14605v1","updated":"2025-01-24T16:22:35Z","published":"2025-01-24T16:22:35Z","title":"3DLabelProp: Geometric-Driven Domain Generalization for LiDAR Semantic\n Segmentation in Autonomous Driving","summary":" Domain generalization aims to find ways for deep learning models to maintain\ntheir performance despite significant domain shifts between training and\ninference datasets. This is particularly important for models that need to be\nrobust or are costly to train. LiDAR perception in autonomous driving is\nimpacted by both of these concerns, leading to the emergence of various\napproaches. This work addresses the challenge by proposing a geometry-based\napproach, leveraging the sequential structure of LiDAR sensors, which sets it\napart from the learning-based methods commonly found in the literature. The\nproposed method, called 3DLabelProp, is applied on the task of LiDAR Semantic\nSegmentation (LSS). Through extensive experimentation on seven datasets, it is\ndemonstrated to be a state-of-the-art approach, outperforming both naive and\nother domain generalization methods.\n","authors":["Jules Sanchez","Jean-Emmanuel Deschaud","François Goulette"],"pdf_url":"https://arxiv.org/pdf/2501.14605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19289v3","updated":"2025-01-24T16:16:52Z","published":"2024-12-26T17:29:38Z","title":"ViPCap: Retrieval Text-Based Visual Prompts for Lightweight Image\n Captioning","summary":" Recent lightweight image captioning models using retrieved data mainly focus\non text prompts. However, previous works only utilize the retrieved text as\ntext prompts, and the visual information relies only on the CLIP visual\nembedding. Because of this issue, there is a limitation that the image\ndescriptions inherent in the prompt are not sufficiently reflected in the\nvisual embedding space. To tackle this issue, we propose ViPCap, a novel\nretrieval text-based visual prompt for lightweight image captioning. ViPCap\nleverages the retrieved text with image information as visual prompts to\nenhance the ability of the model to capture relevant visual information. By\nmapping text prompts into the CLIP space and generating multiple randomized\nGaussian distributions, our method leverages sampling to explore randomly\naugmented distributions and effectively retrieves the semantic features that\ncontain image information. These retrieved features are integrated into the\nimage and designated as the visual prompt, leading to performance improvements\non the datasets such as COCO, Flickr30k, and NoCaps. Experimental results\ndemonstrate that ViPCap significantly outperforms prior lightweight captioning\nmodels in efficiency and effectiveness, demonstrating the potential for a\nplug-and-play solution. 
The source code is available at\nhttps://github.com/taewhankim/VIPCAP.\n","authors":["Taewhan Kim","Soeun Lee","Si-Woo Kim","Dong-Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2412.19289v3.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2501.14593v1","updated":"2025-01-24T15:56:55Z","published":"2025-01-24T15:56:55Z","title":"Geometric Mean Improves Loss For Few-Shot Learning","summary":" Few-shot learning (FSL) is a challenging task in machine learning, demanding\na model to render discriminative classification by using only a few labeled\nsamples. In the literature of FSL, deep models are trained in a manner of\nmetric learning to provide metric in a feature space which is well\ngeneralizable to classify samples of novel classes; in the space, even a few\namount of labeled training examples can construct an effective classifier. In\nthis paper, we propose a novel FSL loss based on \\emph{geometric mean} to embed\ndiscriminative metric into deep features. In contrast to the other losses such\nas utilizing arithmetic mean in softmax-based formulation, the proposed method\nleverages geometric mean to aggregate pair-wise relationships among samples for\nenhancing discriminative metric across class categories. The proposed loss is\nnot only formulated in a simple form but also is thoroughly analyzed in\ntheoretical ways to reveal its favorable characteristics which are favorable\nfor learning feature metric in FSL. In the experiments on few-shot image\nclassification tasks, the method produces competitive performance in comparison\nto the other losses.\n","authors":["Tong Wu","Takumi Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2501.14593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14592v1","updated":"2025-01-24T15:54:51Z","published":"2025-01-24T15:54:51Z","title":"Improved Vessel Segmentation with Symmetric Rotation-Equivariant U-Net","summary":" Automated segmentation plays a pivotal role in medical image analysis and\ncomputer-assisted interventions. Despite the promising performance of existing\nmethods based on convolutional neural networks (CNNs), they neglect useful\nequivariant properties for images, such as rotational and reflection\nequivariance. This limitation can decrease performance and lead to inconsistent\npredictions, especially in applications like vessel segmentation where explicit\norientation is absent. While existing equivariant learning approaches attempt\nto mitigate these issues, they substantially increase learning cost, model\nsize, or both. To overcome these challenges, we propose a novel application of\nan efficient symmetric rotation-equivariant (SRE) convolutional (SRE-Conv)\nkernel implementation to the U-Net architecture, to learn rotation and\nreflection-equivariant features, while also reducing the model size\ndramatically. We validate the effectiveness of our method through improved\nsegmentation performance on retina vessel fundus imaging. Our proposed SRE\nU-Net not only significantly surpasses standard U-Net in handling rotated\nimages, but also outperforms existing equivariant learning methods and does so\nwith a reduced number of trainable parameters and smaller memory cost. The code\nis available at https://github.com/OnofreyLab/sre_conv_segm_isbi2025.\n","authors":["Jiazhen Zhang","Yuexi Du","Nicha C. Dvornek","John A. 
Onofrey"],"pdf_url":"https://arxiv.org/pdf/2501.14592v1.pdf","comment":"Accepted by IEEE ISBI 2025"},{"id":"http://arxiv.org/abs/2501.14587v1","updated":"2025-01-24T15:48:41Z","published":"2025-01-24T15:48:41Z","title":"Visual Localization via Semantic Structures in Autonomous Photovoltaic\n Power Plant Inspection","summary":" Inspection systems utilizing unmanned aerial vehicles (UAVs) equipped with\nthermal cameras are increasingly popular for the maintenance of photovoltaic\n(PV) power plants. However, automation of the inspection task is a challenging\nproblem as it requires precise navigation to capture images from optimal\ndistances and viewing angles.\n This paper presents a novel localization pipeline that directly integrates PV\nmodule detection with UAV navigation, allowing precise positioning during\ninspection. Detections are used to identify the power plant structures in the\nimage and associate these with the power plant model. We define visually\nrecognizable anchor points for the initial association and use object tracking\nto discern global associations. We present three distinct methods for visual\nsegmentation of PV modules based on traditional computer vision, deep learning,\nand their fusion, and we evaluate their performance in relation to the proposed\nlocalization pipeline.\n The presented methods were verified and evaluated using custom aerial\ninspection data sets, demonstrating their robustness and applicability for\nreal-time navigation. Additionally, we evaluate the influence of the power\nplant model's precision on the localization methods.\n","authors":["Viktor Kozák","Karel Košnar","Jan Chudoba","Miroslav Kulich","Libor Přeučil"],"pdf_url":"https://arxiv.org/pdf/2501.14587v1.pdf","comment":"47 pages, 22 figures"},{"id":"http://arxiv.org/abs/2411.07072v2","updated":"2025-01-24T15:26:34Z","published":"2024-11-11T15:47:25Z","title":"An Interpretable X-ray Style Transfer via Trainable Local Laplacian\n Filter","summary":" Radiologists have preferred visual impressions or 'styles' of X-ray images\nthat are manually adjusted to their needs to support their diagnostic\nperformance. In this work, we propose an automatic and interpretable X-ray\nstyle transfer by introducing a trainable version of the Local Laplacian Filter\n(LLF). From the shape of the LLF's optimized remap function, the\ncharacteristics of the style transfer can be inferred and reliability of the\nalgorithm can be ensured. Moreover, we enable the LLF to capture complex X-ray\nstyle features by replacing the remap function with a Multi-Layer Perceptron\n(MLP) and adding a trainable normalization layer. We demonstrate the\neffectiveness of the proposed method by transforming unprocessed mammographic\nX-ray images into images that match the style of target mammograms and achieve\na Structural Similarity Index (SSIM) of 0.94 compared to 0.82 of the baseline\nLLF style transfer method from Aubry et al.\n","authors":["Dominik Eckert","Ludwig Ritschl","Christopher Syben","Christian Hümmer","Julia Wicklein","Marcel Beister","Steffen Kappler","Sebastian Stober"],"pdf_url":"https://arxiv.org/pdf/2411.07072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09018v2","updated":"2025-01-24T15:12:58Z","published":"2024-11-13T20:50:04Z","title":"Bridging the Visual Gap: Fine-Tuning Multimodal Models with\n Knowledge-Adapted Captions","summary":" Recent research increasingly focuses on training vision-language models\n(VLMs) with long, detailed image captions. 
However, small-scale VLMs often\nstruggle to balance the richness of these captions with the risk of\nhallucinating content during fine-tuning. In this paper, we explore how well\nVLMs adapt to such captions. To quantify caption quality, we propose Decomposed\nNLI (DNLI), an evaluation framework that breaks down generated captions into\nindividual propositions, assessing each in isolation. This fine-grained\nanalysis reveals a critical balance between capturing descriptive details and\npreventing hallucinations. Our findings show that simply reducing caption\ncomplexity or employing standard data curation techniques does not effectively\nresolve this issue. To tackle this challenge, we introduce Knowledge Adapted\n(KnowAda) fine-tuning, a data-centric approach that automatically adapts\ntraining data with the model's existing knowledge and visual understanding.\nKnowAda minimizes hallucinations while preserving high descriptiveness. We\nvalidate this approach across several small-scale VLMs (up to 7B parameters)\nand dense caption datasets, demonstrating that KnowAda effectively balances\nhallucination reduction and descriptiveness. Our results show that KnowAda\noutperforms various baselines in both automatic metrics and human evaluations.\nWe will release our code and models.\n","authors":["Moran Yanuka","Assaf Ben Kish","Yonatan Bitton","Idan Szpektor","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2411.09018v2.pdf","comment":"Accepted to NAACL 2025"},{"id":"http://arxiv.org/abs/2501.14548v1","updated":"2025-01-24T14:50:48Z","published":"2025-01-24T14:50:48Z","title":"Large-scale and Fine-grained Vision-language Pre-training for Enhanced\n CT Image Understanding","summary":" Artificial intelligence (AI) shows great potential in assisting radiologists\nto improve the efficiency and accuracy of medical image interpretation and\ndiagnosis. However, a versatile AI model requires large-scale data and\ncomprehensive annotations, which are often impractical in medical settings.\nRecent studies leverage radiology reports as a naturally high-quality\nsupervision for medical images, using contrastive language-image pre-training\n(CLIP) to develop language-informed models for radiological image\ninterpretation. Nonetheless, these approaches typically contrast entire images\nwith reports, neglecting the local associations between imaging regions and\nreport sentences, which may undermine model performance and interoperability.\nIn this paper, we propose a fine-grained vision-language model (fVLM) for\nanatomy-level CT image interpretation. Specifically, we explicitly match\nanatomical regions of CT images with corresponding descriptions in radiology\nreports and perform contrastive pre-training for each anatomy individually.\nFine-grained alignment, however, faces considerable false-negative challenges,\nmainly from the abundance of anatomy-level healthy samples and similarly\ndiseased abnormalities. To tackle this issue, we propose identifying false\nnegatives of both normal and abnormal samples and calibrating contrastive\nlearning from patient-level to disease-aware pairing. We curated the largest CT\ndataset to date, comprising imaging and report data from 69,086 patients, and\nconducted a comprehensive evaluation of 54 major and important disease\ndiagnosis tasks across 15 main anatomies. Experimental results demonstrate the\nsubstantial potential of fVLM in versatile medical image interpretation. 
In the\nzero-shot classification task, we achieved an average AUC of 81.3% on 54\ndiagnosis tasks, surpassing CLIP and supervised methods by 12.9% and 8.0%,\nrespectively.\n","authors":["Zhongyi Shui","Jianpeng Zhang","Weiwei Cao","Sinuo Wang","Ruizhe Guo","Le Lu","Lin Yang","Xianghua Ye","Tingbo Liang","Qi Zhang","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.14548v1.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2501.14546v1","updated":"2025-01-24T14:49:00Z","published":"2025-01-24T14:49:00Z","title":"Leveraging ChatGPT's Multimodal Vision Capabilities to Rank Satellite\n Images by Poverty Level: Advancing Tools for Social Science Research","summary":" This paper investigates the novel application of Large Language Models (LLMs)\nwith vision capabilities to analyze satellite imagery for village-level poverty\nprediction. Although LLMs were originally designed for natural language\nunderstanding, their adaptability to multimodal tasks, including geospatial\nanalysis, has opened new frontiers in data-driven research. By leveraging\nadvancements in vision-enabled LLMs, we assess their ability to provide\ninterpretable, scalable, and reliable insights into human poverty from\nsatellite images. Using a pairwise comparison approach, we demonstrate that\nChatGPT can rank satellite images based on poverty levels with accuracy\ncomparable to domain experts. These findings highlight both the promise and the\nlimitations of LLMs in socioeconomic research, providing a foundation for their\nintegration into poverty assessment workflows. This study contributes to the\nongoing exploration of unconventional data sources for welfare analysis and\nopens pathways for cost-effective, large-scale poverty monitoring.\n","authors":["Hamid Sarmadi","Ola Hall","Thorsteinn Rögnvaldsson","Mattias Ohlsson"],"pdf_url":"https://arxiv.org/pdf/2501.14546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14535v1","updated":"2025-01-24T14:41:30Z","published":"2025-01-24T14:41:30Z","title":"Rethinking Encoder-Decoder Flow Through Shared Structures","summary":" Dense prediction tasks have enjoyed a growing complexity of encoder\narchitectures, decoders, however, have remained largely the same. They rely on\nindividual blocks decoding intermediate feature maps sequentially. We introduce\nbanks, shared structures that are used by each decoding block to provide\nadditional context in the decoding process. These structures, through applying\nthem via resampling and feature fusion, improve performance on depth estimation\nfor state-of-the-art transformer-based architectures on natural and synthetic\nimages whilst training on large-scale datasets.\n","authors":["Frederik Laboyrie","Mehmet Kerim Yucel","Albert Saa-Garriga"],"pdf_url":"https://arxiv.org/pdf/2501.14535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14534v1","updated":"2025-01-24T14:40:40Z","published":"2025-01-24T14:40:40Z","title":"Trick-GS: A Balanced Bag of Tricks for Efficient Gaussian Splatting","summary":" Gaussian splatting (GS) for 3D reconstruction has become quite popular due to\ntheir fast training, inference speeds and high quality reconstruction. However,\nGS-based reconstructions generally consist of millions of Gaussians, which\nmakes them hard to use on computationally constrained devices such as\nsmartphones. In this paper, we first propose a principled analysis of advances\nin efficient GS methods. 
Then, we propose Trick-GS, which is a careful\ncombination of several strategies including (1) progressive training with\nresolution, noise and Gaussian scales, (2) learning to prune and mask\nprimitives and SH bands by their significance, and (3) accelerated GS training\nframework. Trick-GS takes a large step towards resource-constrained GS, where\nfaster run-time, smaller and faster-convergence of models is of paramount\nconcern. Our results on three datasets show that Trick-GS achieves up to 2x\nfaster training, 40x smaller disk size and 2x faster rendering speed compared\nto vanilla GS, while having comparable accuracy.\n","authors":["Anil Armagan","Albert Saà-Garriga","Bruno Manganelli","Mateusz Nowak","Mehmet Kerim Yucel"],"pdf_url":"https://arxiv.org/pdf/2501.14534v1.pdf","comment":"Accepted at ICASSP'25"},{"id":"http://arxiv.org/abs/2501.14533v1","updated":"2025-01-24T14:40:39Z","published":"2025-01-24T14:40:39Z","title":"CheapNVS: Real-Time On-Device Narrow-Baseline Novel View Synthesis","summary":" Single-view novel view synthesis (NVS) is a notorious problem due to its\nill-posed nature, and often requires large, computationally expensive\napproaches to produce tangible results. In this paper, we propose CheapNVS: a\nfully end-to-end approach for narrow baseline single-view NVS based on a novel,\nefficient multiple encoder/decoder design trained in a multi-stage fashion.\nCheapNVS first approximates the laborious 3D image warping with lightweight\nlearnable modules that are conditioned on the camera pose embeddings of the\ntarget view, and then performs inpainting on the occluded regions in parallel\nto achieve significant performance gains. Once trained on a subset of Open\nImages dataset, CheapNVS outperforms the state-of-the-art despite being 10\ntimes faster and consuming 6% less memory. Furthermore, CheapNVS runs\ncomfortably in real-time on mobile devices, reaching over 30 FPS on a Samsung\nTab 9+.\n","authors":["Konstantinos Georgiadis","Mehmet Kerim Yucel","Albert Saa-Garriga"],"pdf_url":"https://arxiv.org/pdf/2501.14533v1.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.14524v1","updated":"2025-01-24T14:27:12Z","published":"2025-01-24T14:27:12Z","title":"Training-Free Style and Content Transfer by Leveraging U-Net Skip\n Connections in Stable Diffusion 2.*","summary":" Despite significant recent advances in image generation with diffusion\nmodels, their internal latent representations remain poorly understood.\nExisting works focus on the bottleneck layer (h-space) of Stable Diffusion's\nU-Net or leverage the cross-attention, self-attention, or decoding layers. Our\nmodel, SkipInject takes advantage of U-Net's skip connections. We conduct\nthorough analyses on the role of the skip connections and find that the\nresidual connections passed by the third encoder block carry most of the\nspatial information of the reconstructed image, splitting the content from the\nstyle. We show that injecting the representations from this block can be used\nfor text-based editing, precise modifications, and style transfer. 
We compare\nour method with state-of-the-art style transfer and image editing methods and\ndemonstrate that our method obtains the best content alignment and optimal\nstructural preservation tradeoff.\n","authors":["Ludovica Schaerf","Andrea Alfarano","Fabrizio Silvestri","Leonardo Impett"],"pdf_url":"https://arxiv.org/pdf/2501.14524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14520v1","updated":"2025-01-24T14:23:31Z","published":"2025-01-24T14:23:31Z","title":"Scene Understanding Enabled Semantic Communication with Open Channel\n Coding","summary":" As communication systems transition from symbol transmission to conveying\nmeaningful information, sixth-generation (6G) networks emphasize semantic\ncommunication. This approach prioritizes high-level semantic information,\nimproving robustness and reducing redundancy across modalities like text,\nspeech, and images. However, traditional semantic communication faces\nlimitations, including static coding strategies, poor generalization, and\nreliance on task-specific knowledge bases that hinder adaptability. To overcome\nthese challenges, we propose a novel system combining scene understanding,\nLarge Language Models (LLMs), and open channel coding, named \textbf{OpenSC}.\nTraditional systems rely on fixed domain-specific knowledge bases, limiting\ntheir ability to generalize. Our open channel coding approach leverages shared,\npublicly available knowledge, enabling flexible, adaptive encoding. This\ndynamic system reduces reliance on static task-specific data, enhancing\nadaptability across diverse tasks and environments. Additionally, we use scene\ngraphs for structured semantic encoding, capturing object relationships and\ncontext to improve tasks like Visual Question Answering (VQA). Our approach\nselectively encodes key semantic elements, minimizing redundancy and improving\ntransmission efficiency. Experimental results show significant improvements in\nboth semantic understanding and efficiency, advancing the potential of\nadaptive, generalizable semantic communication in 6G networks.\n","authors":["Zhe Xiang","Fei Yu","Quan Deng","Yuandi Li","Zhiguo Wan"],"pdf_url":"https://arxiv.org/pdf/2501.14520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14514v1","updated":"2025-01-24T14:18:40Z","published":"2025-01-24T14:18:40Z","title":"PARASIDE: An Automatic Paranasal Sinus Segmentation and Structure\n Analysis Tool for MRI","summary":" Chronic rhinosinusitis (CRS) is a common and persistent sinus inflammation\nthat affects 5 - 12\% of the general population. It significantly impacts\nquality of life and is often difficult to assess due to its subjective nature\nin clinical evaluation. We introduce PARASIDE, an automatic tool for segmenting\nair and soft tissue volumes of the structures of the sinus maxillaris,\nfrontalis, sphenodalis and ethmoidalis in T1 MRI. By utilizing that\nsegmentation, we can quantify feature relations that have been observed only\nmanually and subjectively before. We performed an exemplary study and showed\nboth volume and intensity relations between structures and radiology reports.\nWhile the soft tissue segmentation is good, the automated annotations of the\nair volumes are excellent. The average intensity over air structures is\nconsistently below those of the soft tissues, close to perfect separability.\nHealthy subjects exhibit lower soft tissue volumes and lower intensities. 
Our\ndeveloped system is the first automated whole nasal segmentation of 16\nstructures, and capable of calculating medical relevant features such as the\nLund-Mackay score.\n","authors":["Hendrik Möller","Lukas Krautschick","Matan Atad","Robert Graf","Chia-Jung Busch","Achim Beule","Christian Scharf","Lars Kaderali","Bjoern Menze","Daniel Rueckert","Jan Kirschke","Fabian Schwitzing"],"pdf_url":"https://arxiv.org/pdf/2501.14514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14510v1","updated":"2025-01-24T14:12:04Z","published":"2025-01-24T14:12:04Z","title":"Deep-BrownConrady: Prediction of Camera Calibration and Distortion\n Parameters Using Deep Learning and Synthetic Data","summary":" This research addresses the challenge of camera calibration and distortion\nparameter prediction from a single image using deep learning models. The main\ncontributions of this work are: (1) demonstrating that a deep learning model,\ntrained on a mix of real and synthetic images, can accurately predict camera\nand lens parameters from a single image, and (2) developing a comprehensive\nsynthetic dataset using the AILiveSim simulation platform. This dataset\nincludes variations in focal length and lens distortion parameters, providing a\nrobust foundation for model training and testing. The training process\npredominantly relied on these synthetic images, complemented by a small subset\nof real images, to explore how well models trained on synthetic data can\nperform calibration tasks on real-world images. Traditional calibration methods\nrequire multiple images of a calibration object from various orientations,\nwhich is often not feasible due to the lack of such images in publicly\navailable datasets. A deep learning network based on the ResNet architecture\nwas trained on this synthetic dataset to predict camera calibration parameters\nfollowing the Brown-Conrady lens model. The ResNet architecture, adapted for\nregression tasks, is capable of predicting continuous values essential for\naccurate camera calibration in applications such as autonomous driving,\nrobotics, and augmented reality.\n Keywords: Camera calibration, distortion, synthetic data, deep learning,\nresidual networks (ResNet), AILiveSim, horizontal field-of-view, principal\npoint, Brown-Conrady Model.\n","authors":["Faiz Muhammad Chaudhry","Jarno Ralli","Jerome Leudet","Fahad Sohrab","Farhad Pakdaman","Pierre Corbani","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2501.14510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14502v1","updated":"2025-01-24T14:01:51Z","published":"2025-01-24T14:01:51Z","title":"LiDAR-Based Vehicle Detection and Tracking for Autonomous Racing","summary":" Autonomous racing provides a controlled environment for testing the software\nand hardware of autonomous vehicles operating at their performance limits.\nCompetitive interactions between multiple autonomous racecars however introduce\nchallenging and potentially dangerous scenarios. Accurate and consistent\nvehicle detection and tracking is crucial for overtaking maneuvers, and\nlow-latency sensor processing is essential to respond quickly to hazardous\nsituations. This paper presents the LiDAR-based perception algorithms deployed\non Team PoliMOVE's autonomous racecar, which won multiple competitions in the\nIndy Autonomous Challenge series. 
Our Vehicle Detection and Tracking pipeline\nis composed of a novel fast Point Cloud Segmentation technique and a specific\nVehicle Pose Estimation methodology, together with a variable-step Multi-Target\nTracking algorithm. Experimental results demonstrate the algorithm's\nperformance, robustness, computational efficiency, and suitability for\nautonomous racing applications, enabling fully autonomous overtaking maneuvers\nat velocities exceeding 275 km/h.\n","authors":["Marcello Cellina","Matteo Corno","Sergio Matteo Savaresi"],"pdf_url":"https://arxiv.org/pdf/2501.14502v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2406.18849v3","updated":"2025-01-24T13:58:49Z","published":"2024-06-27T02:40:35Z","title":"Dysca: A Dynamic and Scalable Benchmark for Evaluating Perception\n Ability of LVLMs","summary":" Currently many benchmarks have been proposed to evaluate the perception\nability of the Large Vision-Language Models (LVLMs). However, most benchmarks\nconduct questions by selecting images from existing datasets, resulting in the\npotential data leakage. Besides, these benchmarks merely focus on evaluating\nLVLMs on the realistic style images and clean scenarios, leaving the\nmulti-stylized images and noisy scenarios unexplored. In response to these\nchallenges, we propose a dynamic and scalable benchmark named Dysca for\nevaluating LVLMs by leveraging synthesis images. Specifically, we leverage\nStable Diffusion and design a rule-based method to dynamically generate novel\nimages, questions and the corresponding answers. We consider 51 kinds of image\nstyles and evaluate the perception capability in 20 subtasks. Moreover, we\nconduct evaluations under 4 scenarios (i.e., Clean, Corruption, Print Attacking\nand Adversarial Attacking) and 3 question types (i.e., Multi-choices,\nTrue-or-false and Free-form). Thanks to the generative paradigm, Dysca serves\nas a scalable benchmark for easily adding new subtasks and scenarios. A total\nof 24 advanced open-source LVLMs and 2 close-source LVLMs are evaluated on\nDysca, revealing the drawbacks of current LVLMs. The benchmark is released at\n\\url{https://github.com/Robin-WZQ/Dysca}.\n","authors":["Jie Zhang","Zhongqi Wang","Mengqi Lei","Zheng Yuan","Bei Yan","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2406.18849v3.pdf","comment":"Accepted by ICLR2025"},{"id":"http://arxiv.org/abs/2411.12724v2","updated":"2025-01-24T13:54:43Z","published":"2024-11-19T18:45:16Z","title":"Heuristic-Free Multi-Teacher Learning","summary":" We introduce Teacher2Task, a novel framework for multi-teacher learning that\neliminates the need for manual aggregation heuristics. Existing multi-teacher\nmethods typically rely on such heuristics to combine predictions from multiple\nteachers, often resulting in sub-optimal aggregated labels and the propagation\nof aggregation errors. Teacher2Task addresses these limitations by introducing\nteacher-specific input tokens and reformulating the training process. Instead\nof relying on aggregated labels, the framework transforms the training data,\nconsisting of ground truth labels and annotations from N teachers, into N+1\ndistinct tasks: N auxiliary tasks that predict the labeling styles of the N\nindividual teachers, and one primary task that focuses on the ground truth\nlabels. 
This approach, drawing upon principles from multiple learning\nparadigms, demonstrates strong empirical results across a range of\narchitectures, modalities, and tasks.\n","authors":["Huy Thong Nguyen","En-Hung Chu","Lenord Melvix","Jazon Jiao","Chunglin Wen","Benjamin Louie"],"pdf_url":"https://arxiv.org/pdf/2411.12724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14496v1","updated":"2025-01-24T13:52:37Z","published":"2025-01-24T13:52:37Z","title":"A Note on Implementation Errors in Recent Adaptive Attacks Against\n Multi-Resolution Self-Ensembles","summary":" This note documents an implementation issue in recent adaptive attacks (Zhang\net al. [2024]) against the multi-resolution self-ensemble defense (Fort and\nLakshminarayanan [2024]). The implementation allowed adversarial perturbations\nto exceed the standard $L_\\infty = 8/255$ bound by up to a factor of\n20$\\times$, reaching magnitudes of up to $L_\\infty = 160/255$. When attacks are\nproperly constrained within the intended bounds, the defense maintains\nnon-trivial robustness. Beyond highlighting the importance of careful\nvalidation in adversarial machine learning research, our analysis reveals an\nintriguing finding: properly bounded adaptive attacks against strong\nmulti-resolution self-ensembles often align with human perception, suggesting\nthe need to reconsider how we measure adversarial robustness.\n","authors":["Stanislav Fort"],"pdf_url":"https://arxiv.org/pdf/2501.14496v1.pdf","comment":"4 pages, 2 figures, technical note addressing an issue in\n arXiv:2411.14834v1"},{"id":"http://arxiv.org/abs/2501.14495v1","updated":"2025-01-24T13:51:47Z","published":"2025-01-24T13:51:47Z","title":"BILLNET: A Binarized Conv3D-LSTM Network with Logic-gated residual\n architecture for hardware-efficient video inference","summary":" Long Short-Term Memory (LSTM) and 3D convolution (Conv3D) show impressive\nresults for many video-based applications but require large memory and\nintensive computing. Motivated by recent works on hardware-algorithmic\nco-design towards efficient inference, we propose a compact binarized\nConv3D-LSTM model architecture called BILLNET, compatible with a highly\nresource-constrained hardware. Firstly, BILLNET proposes to factorize the\ncostly standard Conv3D by two pointwise convolutions with a grouped convolution\nin-between. Secondly, BILLNET enables binarized weights and activations via a\nMUX-OR-gated residual architecture. Finally, to efficiently train BILLNET, we\npropose a multi-stage training strategy enabling to fully quantize LSTM layers.\nResults on Jester dataset show that our method can obtain high accuracy with\nextremely low memory and computational budgets compared to existing Conv3D\nresource-efficient models.\n","authors":["Van Thien Nguyen","William Guicquero","Gilles Sicard"],"pdf_url":"https://arxiv.org/pdf/2501.14495v1.pdf","comment":"Published at IEEE SiPS 2022"},{"id":"http://arxiv.org/abs/2501.14483v1","updated":"2025-01-24T13:35:59Z","published":"2025-01-24T13:35:59Z","title":"Registration of Longitudinal Liver Examinations for Tumor Progress\n Assessment","summary":" Assessing cancer progression in liver CT scans is a clinical challenge,\nrequiring a comparison of scans at different times for the same patient.\nPractitioners must identify existing tumors, compare them with prior exams,\nidentify new tumors, and evaluate overall disease evolution. This process is\nparticularly complex in liver examinations due to misalignment between exams\ncaused by several factors. 
Indeed, longitudinal liver examinations can undergo\ndifferent non-pathological and pathological changes due to non-rigid\ndeformations, the appearance or disappearance of pathologies, and other\nvariations. In such cases, existing registration approaches, mainly based on\nintrinsic features may distort tumor regions, biasing the tumor progress\nevaluation step and the corresponding diagnosis. This work proposes a\nregistration method based only on geometrical and anatomical information from\nliver segmentation, aimed at aligning longitudinal liver images for aided\ndiagnosis. The proposed method is trained and tested on longitudinal liver CT\nscans, with 317 patients for training and 53 for testing. Our experimental\nresults support our claims by showing that our method is better than other\nregistration techniques by providing a smoother deformation while preserving\nthe tumor burden (total volume of tissues considered as tumor) within the\nvolume. Qualitative results emphasize the importance of smooth deformations in\npreserving tumor appearance.\n","authors":["Walid Yassine","Martin Charachon","Céline Hudelot","Roberto Ardon"],"pdf_url":"https://arxiv.org/pdf/2501.14483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04772v2","updated":"2025-01-24T13:10:15Z","published":"2024-06-07T09:17:33Z","title":"REP: Resource-Efficient Prompting for Rehearsal-Free Continual Learning","summary":" Recent rehearsal-free methods, guided by prompts, generally excel in\nvision-related continual learning (CL) scenarios with continuously drifting\ndata. To be deployable on real-world devices, these methods must contain high\nresource efficiency during training. In this paper, we introduce\nResource-Efficient Prompting (REP), which targets improving the resource\nefficiency of prompt-based rehearsal-free methods. Our key focus is on avoiding\ncatastrophic trade-offs with accuracy while trimming computational and memory\ncosts during prompt learning. We achieve this by exploiting swift prompt\nselection that enhances input data using a carefully provisioned model, and by\ndeveloping adaptive token merging (AToM) and layer dropping (ALD) algorithms\nfor the prompt updating stage. AToM and ALD perform selective skipping across\nthe data and model dimensions without compromising task-specific features while\nlearning new tasks. We validate REP's superior resource efficiency over current\nstate-of-the-art ViT- and CNN-based methods through extensive experiments on\nthree image classification datasets.\n","authors":["Sungho Jeon","Xinyue Ma","Kwang In Kim","Myeongjae Jeon"],"pdf_url":"https://arxiv.org/pdf/2406.04772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10455v2","updated":"2025-01-24T13:00:28Z","published":"2025-01-14T20:02:59Z","title":"PhyDeformer: High-Quality Non-Rigid Garment Registration with\n Physics-Awareness","summary":" We present PhyDeformer, a new deformation method for high-quality garment\nmesh registration. It operates in two phases: In the first phase, a garment\ngrading is performed to achieve a coarse 3D alignment between the mesh template\nand the target mesh, accounting for proportional scaling and fit (e.g. length,\nsize). Then, the graded mesh is refined to align with the fine-grained details\nof the 3D target through an optimization coupled with the Jacobian-based\ndeformation framework. 
Both quantitative and qualitative evaluations on\nsynthetic and real garments highlight the effectiveness of our method.\n","authors":["Boyang Yu","Frederic Cordier","Hyewon Seo"],"pdf_url":"https://arxiv.org/pdf/2501.10455v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12323v2","updated":"2025-01-24T12:58:30Z","published":"2025-01-21T17:42:06Z","title":"Deep Learning Based Segmentation of Blood Vessels from H&E Stained\n Oesophageal Adenocarcinoma Whole-Slide Images","summary":" Blood vessels (BVs) play a critical role in the Tumor Micro-Environment\n(TME), potentially influencing cancer progression and treatment response.\nHowever, manually quantifying BVs in Hematoxylin and Eosin (H&E) stained images\nis challenging and labor-intensive due to their heterogeneous appearances. We\npropose a novel approach of constructing guiding maps to improve the\nperformance of state-of-the-art segmentation models for BV segmentation, the\nguiding maps encourage the models to learn representative features of BVs. This\nis particularly beneficial for computational pathology, where labeled training\ndata is often limited and large models are prone to overfitting. We have\nquantitative and qualitative results to demonstrate the efficacy of our\napproach in improving segmentation accuracy. In future, we plan to validate\nthis method to segment BVs across various tissue types and investigate the role\nof cellular structures in relation to BVs in the TME.\n","authors":["Jiaqi Lv","Stefan S Antonowicz","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2501.12323v2.pdf","comment":"Accepted by ISBI 2025"},{"id":"http://arxiv.org/abs/2403.08378v4","updated":"2025-01-24T12:49:23Z","published":"2024-03-13T09:43:14Z","title":"An Adaptive Cost-Sensitive Learning and Recursive Denoising Framework\n for Imbalanced SVM Classification","summary":" Category imbalance is one of the most popular and important issues in the\ndomain of classification. Emotion classification model trained on imbalanced\ndatasets easily leads to unreliable prediction. The traditional machine\nlearning method tends to favor the majority class, which leads to the lack of\nminority class information in the model. Moreover, most existing models will\nproduce abnormal sensitivity issues or performance degradation. We propose a\nrobust learning algorithm based on adaptive cost-sensitivity and recursive\ndenoising, which is a generalized framework and can be incorporated into most\nstochastic optimization algorithms. The proposed method uses the dynamic kernel\ndistance optimization model between the sample and the decision boundary, which\nmakes full use of the sample's prior information. In addition, we also put\nforward an effective method to filter noise, the main idea of which is to judge\nthe noise by finding the nearest neighbors of the minority class. In order to\nevaluate the strength of the proposed method, we not only carry out experiments\non standard datasets but also apply it to emotional classification problems\nwith different imbalance rates (IR). 
Experimental results show that the\nproposed general framework is superior to traditional methods in Accuracy,\nG-mean, Recall and F1-score.\n","authors":["Lu Jiang","Qi Wang","Yuhang Chang","Jianing Song","Haoyue Fu","Xiaochun Yang"],"pdf_url":"https://arxiv.org/pdf/2403.08378v4.pdf","comment":"22 pages, 41 figures"},{"id":"http://arxiv.org/abs/2501.13554v2","updated":"2025-01-24T12:43:48Z","published":"2025-01-23T10:57:22Z","title":"One-Prompt-One-Story: Free-Lunch Consistent Text-to-Image Generation\n Using a Single Prompt","summary":" Text-to-image generation models can create high-quality images from input\nprompts. However, they struggle to support the consistent generation of\nidentity-preserving requirements for storytelling. Existing approaches to this\nproblem typically require extensive training in large datasets or additional\nmodifications to the original model architectures. This limits their\napplicability across different domains and diverse diffusion model\nconfigurations. In this paper, we first observe the inherent capability of\nlanguage models, coined context consistency, to comprehend identity through\ncontext with a single prompt. Drawing inspiration from the inherent context\nconsistency, we propose a novel training-free method for consistent\ntext-to-image (T2I) generation, termed \"One-Prompt-One-Story\" (1Prompt1Story).\nOur approach 1Prompt1Story concatenates all prompts into a single input for T2I\ndiffusion models, initially preserving character identities. We then refine the\ngeneration process using two novel techniques: Singular-Value Reweighting and\nIdentity-Preserving Cross-Attention, ensuring better alignment with the input\ndescription for each frame. In our experiments, we compare our method against\nvarious existing consistent T2I generation approaches to demonstrate its\neffectiveness through quantitative metrics and qualitative assessments. Code is\navailable at https://github.com/byliutao/1Prompt1Story.\n","authors":["Tao Liu","Kai Wang","Senmao Li","Joost van de Weijer","Fahad Shahbaz Khan","Shiqi Yang","Yaxing Wang","Jian Yang","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.13554v2.pdf","comment":"28 pages, 22 figures, ICLR2025 conference"},{"id":"http://arxiv.org/abs/2501.14455v1","updated":"2025-01-24T12:35:36Z","published":"2025-01-24T12:35:36Z","title":"Triple Path Enhanced Neural Architecture Search for Multimodal Fake News\n Detection","summary":" Multimodal fake news detection has become one of the most crucial issues on\nsocial media platforms. Although existing methods have achieved advanced\nperformance, two main challenges persist: (1) Under-performed multimodal news\ninformation fusion due to model architecture solidification, and (2) weak\ngeneralization ability on partial-modality contained fake news. To meet these\nchallenges, we propose a novel and flexible triple path enhanced neural\narchitecture search model MUSE. MUSE includes two dynamic paths for detecting\npartial-modality contained fake news and a static path for exploiting potential\nmultimodal correlations. 
Experimental results show that MUSE achieves stable\nperformance improvement over the baselines.\n","authors":["Bo Xu","Qiujie Xie","Jiahui Zhou","Linlin Zong"],"pdf_url":"https://arxiv.org/pdf/2501.14455v1.pdf","comment":"This paper has been accepted into the IEEE International Conference\n on Acoustics, Speech, and Signal Processing(ICASSP 2024)"},{"id":"http://arxiv.org/abs/2405.12114v2","updated":"2025-01-24T12:24:58Z","published":"2024-05-20T15:29:26Z","title":"A New Cross-Space Total Variation Regularization Model for Color Image\n Restoration with Quaternion Blur Operator","summary":" The cross-channel deblurring problem in color image processing is difficult\nto solve due to the complex coupling and structural blurring of color pixels.\nUntil now, there are few efficient algorithms that can reduce color artifacts\nin deblurring process. To solve this challenging problem, we present a novel\ncross-space total variation (CSTV) regularization model for color image\ndeblurring by introducing a quaternion blur operator and a cross-color space\nregularization functional. The existence and uniqueness of the solution is\nproved and a new L-curve method is proposed to find a balance of regularization\nterms on different color spaces. The Euler-Lagrange equation is derived to show\nthat CSTV has taken into account the coupling of all color channels and the\nlocal smoothing within each color channel. A quaternion operator splitting\nmethod is firstly proposed to enhance the ability of color artifacts reduction\nof the CSTV regularization model. This strategy also applies to the well-known\ncolor deblurring models. Numerical experiments on color image databases\nillustrate the efficiency and effectiveness of the new model and algorithms.\nThe color images restored by them successfully maintain the color and spatial\ninformation and are of higher quality in terms of PSNR, SSIM, MSE and CIEde2000\nthan the restorations of the-state-of-the-art methods.\n","authors":["Zhigang Jia","Yuelian Xiang","Meixiang Zhao","Tingting Wu","Michael K. Ng"],"pdf_url":"https://arxiv.org/pdf/2405.12114v2.pdf","comment":"15pages,14figures"},{"id":"http://arxiv.org/abs/2501.14439v1","updated":"2025-01-24T12:17:47Z","published":"2025-01-24T12:17:47Z","title":"Optimizing Human Pose Estimation Through Focused Human and Joint Regions","summary":" Human pose estimation has given rise to a broad spectrum of novel and\ncompelling applications, including action recognition, sports analysis, as well\nas surveillance. However, accurate video pose estimation remains an open\nchallenge. One aspect that has been overlooked so far is that existing methods\nlearn motion clues from all pixels rather than focusing on the target human\nbody, making them easily misled and disrupted by unimportant information such\nas background changes or movements of other people. Additionally, while the\ncurrent Transformer-based pose estimation methods has demonstrated impressive\nperformance with global modeling, they struggle with local context perception\nand precise positional identification. In this paper, we try to tackle these\nchallenges from three aspects: (1) We propose a bilayer Human-Keypoint Mask\nmodule that performs coarse-to-fine visual token refinement, which gradually\nzooms in on the target human body and keypoints while masking out unimportant\nfigure regions. 
(2) We further introduce a novel deformable cross attention\nmechanism and a bidirectional separation strategy to adaptively aggregate\nspatial and temporal motion clues from constrained surrounding contexts. (3) We\nmathematically formulate the deformable cross attention, constraining that the\nmodel focuses solely on the regions centered at the target person body.\nEmpirically, our method achieves state-of-the-art performance on three\nlarge-scale benchmark datasets. A remarkable highlight is that our method\nachieves an 84.8 mean Average Precision (mAP) on the challenging wrist joint,\nwhich significantly outperforms the 81.5 mAP achieved by the current\nstate-of-the-art method on the PoseTrack2017 dataset.\n","authors":["Yingying Jiao","Zhigang Wang","Zhenguang Liu","Shaojing Fan","Sifan Wu","Zheqi Wu","Zhuoyue Xu"],"pdf_url":"https://arxiv.org/pdf/2501.14439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14413v1","updated":"2025-01-24T11:28:17Z","published":"2025-01-24T11:28:17Z","title":"Context-CrackNet: A Context-Aware Framework for Precise Segmentation of\n Tiny Cracks in Pavement images","summary":" The accurate detection and segmentation of pavement distresses, particularly\ntiny and small cracks, are critical for early intervention and preventive\nmaintenance in transportation infrastructure. Traditional manual inspection\nmethods are labor-intensive and inconsistent, while existing deep learning\nmodels struggle with fine-grained segmentation and computational efficiency. To\naddress these challenges, this study proposes Context-CrackNet, a novel\nencoder-decoder architecture featuring the Region-Focused Enhancement Module\n(RFEM) and Context-Aware Global Module (CAGM). These innovations enhance the\nmodel's ability to capture fine-grained local details and global contextual\ndependencies, respectively. Context-CrackNet was rigorously evaluated on ten\npublicly available crack segmentation datasets, covering diverse pavement\ndistress scenarios. The model consistently outperformed 9 state-of-the-art\nsegmentation frameworks, achieving superior performance metrics such as mIoU\nand Dice score, while maintaining competitive inference efficiency. Ablation\nstudies confirmed the complementary roles of RFEM and CAGM, with notable\nimprovements in mIoU and Dice score when both modules were integrated.\nAdditionally, the model's balance of precision and computational efficiency\nhighlights its potential for real-time deployment in large-scale pavement\nmonitoring systems.\n","authors":["Blessing Agyei Kyem","Joshua Kofi Asamoah","Armstrong Aboah"],"pdf_url":"https://arxiv.org/pdf/2501.14413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14404v1","updated":"2025-01-24T11:18:19Z","published":"2025-01-24T11:18:19Z","title":"Kolmogorov Arnold Neural Interpolator for Downscaling and Correcting\n Meteorological Fields from In-Situ Observations","summary":" Obtaining accurate weather forecasts at station locations is a critical\nchallenge due to systematic biases arising from the mismatch between\nmulti-scale, continuous atmospheric characteristic and their discrete, gridded\nrepresentations. Previous works have primarily focused on modeling gridded\nmeteorological data, inherently neglecting the off-grid, continuous nature of\natmospheric states and leaving such biases unresolved. 
To address this, we\npropose the Kolmogorov Arnold Neural Interpolator (KANI), a novel framework\nthat redefines meteorological field representation as continuous neural\nfunctions derived from discretized grids. Grounded in the Kolmogorov Arnold\ntheorem, KANI captures the inherent continuity of atmospheric states and\nleverages sparse in-situ observations to correct these biases systematically.\nFurthermore, KANI introduces an innovative zero-shot downscaling capability,\nguided by high-resolution topographic textures without requiring\nhigh-resolution meteorological fields for supervision. Experimental results\nacross three sub-regions of the continental United States indicate that KANI\nachieves an accuracy improvement of 40.28% for temperature and 67.41% for wind\nspeed, highlighting its significant improvement over traditional interpolation\nmethods. This enables continuous neural representation of meteorological\nvariables through neural networks, transcending the limitations of conventional\ngrid-based representations.\n","authors":["Zili Liu","Hao Chen","Lei Bai","Wenyuan Li","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2501.14404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14401v1","updated":"2025-01-24T11:14:35Z","published":"2025-01-24T11:14:35Z","title":"CVOCSemRPL: Class-Variance Optimized Clustering, Semantic Information\n Injection and Restricted Pseudo Labeling based Improved Semi-Supervised\n Few-Shot Learning","summary":" Few-shot learning has been extensively explored to address problems where the\namount of labeled samples is very limited for some classes. In the\nsemi-supervised few-shot learning setting, substantial quantities of unlabeled\nsamples are available. Such unlabeled samples are generally cheaper to obtain\nand can be used to improve the few-shot learning performance of the model. Some\nof the recent methods for this setting rely on clustering to generate\npseudo-labels for the unlabeled samples. Since the quality of the\nrepresentation learned by the model heavily influences the effectiveness of\nclustering, this might also lead to incorrect labeling of the unlabeled samples\nand consequently lead to a drop in the few-shot learning performance. We\npropose an approach for semi-supervised few-shot learning that performs a\nclass-variance optimized clustering in order to improve the effectiveness of\nclustering the labeled and unlabeled samples in this setting. It also optimizes\nthe clustering-based pseudo-labeling process using a restricted pseudo-labeling\napproach and performs semantic information injection in order to improve the\nsemi-supervised few-shot learning performance of the model. We experimentally\ndemonstrate that our proposed approach significantly outperforms recent\nstate-of-the-art methods on the benchmark datasets.\n","authors":["Rhythm Baghel","Souvik Maji","Pratik Mazumder"],"pdf_url":"https://arxiv.org/pdf/2501.14401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23085v3","updated":"2025-01-24T10:46:53Z","published":"2024-10-30T15:00:06Z","title":"S3PT: Scene Semantics and Structure Guided Clustering to Boost\n Self-Supervised Pre-Training for Autonomous Driving","summary":" Recent self-supervised clustering-based pre-training techniques like DINO and\nCribo have shown impressive results for downstream detection and segmentation\ntasks. However, real-world applications such as autonomous driving face\nchallenges with imbalanced object class and size distributions and complex\nscene geometries. 
In this paper, we propose S3PT a novel scene semantics and\nstructure guided clustering to provide more scene-consistent objectives for\nself-supervised training. Specifically, our contributions are threefold: First,\nwe incorporate semantic distribution consistent clustering to encourage better\nrepresentation of rare classes such as motorcycles or animals. Second, we\nintroduce object diversity consistent spatial clustering, to handle imbalanced\nand diverse object sizes, ranging from large background areas to small objects\nsuch as pedestrians and traffic signs. Third, we propose a depth-guided spatial\nclustering to regularize learning based on geometric information of the scene,\nthus further refining region separation on the feature level. Our learned\nrepresentations significantly improve performance in downstream semantic\nsegmentation and 3D object detection tasks on the nuScenes, nuImages, and\nCityscapes datasets and show promising domain translation properties.\n","authors":["Maciej K. Wozniak","Hariprasath Govindarajan","Marvin Klingner","Camille Maurice","B Ravi Kiran","Senthil Yogamani"],"pdf_url":"https://arxiv.org/pdf/2410.23085v3.pdf","comment":"Accepted for WACV 2025 (Oral)"},{"id":"http://arxiv.org/abs/2401.15578v3","updated":"2025-01-24T10:29:12Z","published":"2024-01-28T06:23:55Z","title":"ASCNet: Asymmetric Sampling Correction Network for Infrared Image\n Destriping","summary":" In a real-world infrared imaging system, effectively learning a consistent\nstripe noise removal model is essential. Most existing destriping methods\ncannot precisely reconstruct images due to cross-level semantic gaps and\ninsufficient characterization of the global column features. To tackle this\nproblem, we propose a novel infrared image destriping method, called Asymmetric\nSampling Correction Network (ASCNet), that can effectively capture global\ncolumn relationships and embed them into a U-shaped framework, providing\ncomprehensive discriminative representation and seamless semantic connectivity.\nOur ASCNet consists of three core elements: Residual Haar Discrete Wavelet\nTransform (RHDWT), Pixel Shuffle (PS), and Column Non-uniformity Correction\nModule (CNCM). Specifically, RHDWT is a novel downsampler that employs\ndouble-branch modeling to effectively integrate stripe-directional prior\nknowledge and data-driven semantic interaction to enrich the feature\nrepresentation. Observing the semantic patterns crosstalk of stripe noise, PS\nis introduced as an upsampler to prevent excessive apriori decoding and\nperforming semantic-bias-free image reconstruction. After each sampling, CNCM\ncaptures the column relationships in long-range dependencies. By incorporating\ncolumn, spatial, and self-dependence information, CNCM well establishes a\nglobal context to distinguish stripes from the scene's vertical structures.\nExtensive experiments on synthetic data, real data, and infrared small target\ndetection tasks demonstrate that the proposed method outperforms\nstate-of-the-art single-image destriping methods both visually and\nquantitatively. 
Our code will be made publicly available at\nhttps://github.com/xdFai/ASCNet.\n","authors":["Shuai Yuan","Hanlin Qin","Xiang Yan","Shiqi Yang","Shuowen Yang","Naveed Akhtar","Huixin Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.15578v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14379v1","updated":"2025-01-24T10:28:05Z","published":"2025-01-24T10:28:05Z","title":"ECTIL: Label-efficient Computational Tumour Infiltrating Lymphocyte\n (TIL) assessment in breast cancer: Multicentre validation in 2,340 patients\n with breast cancer","summary":" The level of tumour-infiltrating lymphocytes (TILs) is a prognostic factor\nfor patients with (triple-negative) breast cancer (BC). Computational TIL\nassessment (CTA) has the potential to assist pathologists in this\nlabour-intensive task, but current CTA models rely heavily on many detailed\nannotations. We propose and validate a fundamentally simpler deep learning\nbased CTA that can be trained in only ten minutes on hundredfold fewer\npathologist annotations. We collected whole slide images (WSIs) with TILs\nscores and clinical data of 2,340 patients with BC from six cohorts including\nthree randomised clinical trials. Morphological features were extracted from\nwhole slide images (WSIs) using a pathology foundation model. Our\nlabel-efficient Computational stromal TIL assessment model (ECTIL) directly\nregresses the TILs score from these features. ECTIL trained on only a few\nhundred samples (ECTIL-TCGA) showed concordance with the pathologist over five\nheterogeneous external cohorts (r=0.54-0.74, AUROC=0.80-0.94). Training on all\nslides of five cohorts (ECTIL-combined) improved results on a held-out test set\n(r=0.69, AUROC=0.85). Multivariable Cox regression analyses indicated that\nevery 10% increase of ECTIL scores was associated with improved overall\nsurvival independent of clinicopathological variables (HR 0.86, p<0.01),\nsimilar to the pathologist score (HR 0.87, p<0.001). We demonstrate that ECTIL\nis highly concordant with an expert pathologist and obtains a similar hazard\nratio. ECTIL has a fundamentally simpler design than existing methods and can\nbe trained on orders of magnitude fewer annotations. Such a CTA may be used to\npre-screen patients for, e.g., immunotherapy clinical trial inclusion, or as a\ntool to assist clinicians in the diagnostic work-up of patients with BC. Our\nmodel is available under an open source licence\n(https://github.com/nki-ai/ectil).\n","authors":["Yoni Schirris","Rosie Voorthuis","Mark Opdam","Marte Liefaard","Gabe S Sonke","Gwen Dackus","Vincent de Jong","Yuwei Wang","Annelot Van Rossum","Tessa G Steenbruggen","Lars C Steggink","Liesbeth G. E. de Vries","Marc van de Vijver","Roberto Salgado","Efstratios Gavves","Paul J van Diest","Sabine C Linn","Jonas Teuwen","Renee Menezes","Marleen Kok","Hugo Horlings"],"pdf_url":"https://arxiv.org/pdf/2501.14379v1.pdf","comment":"Under review. 54 pages including supplementary materials, 2 main\n tables, 3 main figures, 14 supplementary figures, 4 supplementary tables"},{"id":"http://arxiv.org/abs/2501.14369v1","updated":"2025-01-24T10:00:47Z","published":"2025-01-24T10:00:47Z","title":"Low-rank Prompt Interaction for Continual Vision-Language Retrieval","summary":" Research on continual learning in multi-modal tasks has been receiving\nincreasing attention. However, most existing work overlooks the explicit\ncross-modal and cross-task interactions. 
In this paper, we innovatively propose\nthe Low-rank Prompt Interaction (LPI) to address this general problem of\nmulti-modal understanding, which considers both cross-modal and cross-task\ninteractions. Specifically, as for the former, we employ multi-modal\ncorrelation modules for corresponding Transformer layers. Considering that the\ntraining parameters scale to the number of layers and tasks, we propose\nlow-rank interaction-augmented decomposition to avoid memory explosion while\nenhancing the cross-modal association through sharing and separating\ncommon-specific low-rank factors. In addition, due to the multi-modal semantic\ndifferences carried by the low-rank initialization, we adopt hierarchical\nlow-rank contrastive learning to ensure training robustness. As for the latter,\nwe initially employ a visual analysis and identify that different tasks have\nclear distinctions in proximity. Therefore, we introduce explicit task\ncontrastive constraints in the prompt learning process based on task semantic\ndistances. Experiments on two retrieval tasks show performance improvements\nwith the introduction of a minimal number of parameters, demonstrating the\neffectiveness of our method. Code is available at\nhttps://github.com/Kelvin-ywc/LPI.\n","authors":["Weicai Yan","Ye Wang","Wang Lin","Zirun Guo","Zhou Zhao","Tao Jin"],"pdf_url":"https://arxiv.org/pdf/2501.14369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14356v1","updated":"2025-01-24T09:45:16Z","published":"2025-01-24T09:45:16Z","title":"Causal-Inspired Multitask Learning for Video-Based Human Pose Estimation","summary":" Video-based human pose estimation has long been a fundamental yet challenging\nproblem in computer vision. Previous studies focus on spatio-temporal modeling\nthrough the enhancement of architecture design and optimization strategies.\nHowever, they overlook the causal relationships in the joints, leading to\nmodels that may be overly tailored and thus estimate poorly to challenging\nscenes. Therefore, adequate causal reasoning capability, coupled with good\ninterpretability of model, are both indispensable and prerequisite for\nachieving reliable results. In this paper, we pioneer a causal perspective on\npose estimation and introduce a causal-inspired multitask learning framework,\nconsisting of two stages. \\textit{In the first stage}, we try to endow the\nmodel with causal spatio-temporal modeling ability by introducing two\nself-supervision auxiliary tasks. Specifically, these auxiliary tasks enable\nthe network to infer challenging keypoints based on observed keypoint\ninformation, thereby imbuing causal reasoning capabilities into the model and\nmaking it robust to challenging scenes. \\textit{In the second stage}, we argue\nthat not all feature tokens contribute equally to pose estimation. Prioritizing\ncausal (keypoint-relevant) tokens is crucial to achieve reliable results, which\ncould improve the interpretability of the model. To this end, we propose a\nToken Causal Importance Selection module to identify the causal tokens and\nnon-causal tokens (\\textit{e.g.}, background and objects). Additionally,\nnon-causal tokens could provide potentially beneficial cues but may be\nredundant. We further introduce a non-causal tokens clustering module to merge\nthe similar non-causal tokens. 
Extensive experiments show that our method\noutperforms state-of-the-art methods on three large-scale benchmark datasets.\n","authors":["Haipeng Chen","Sifan Wu","Zhigang Wang","Yifang Yin","Yingying Jiao","Yingda Lyu","Zhenguang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.14356v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.08270v2","updated":"2025-01-24T09:18:44Z","published":"2024-08-15T17:14:57Z","title":"HeightLane: BEV Heightmap guided 3D Lane Detection","summary":" Accurate 3D lane detection from monocular images presents significant\nchallenges due to depth ambiguity and imperfect ground modeling. Previous\nattempts to model the ground have often used a planar ground assumption with\nlimited degrees of freedom, making them unsuitable for complex road\nenvironments with varying slopes. Our study introduces HeightLane, an\ninnovative method that predicts a height map from monocular images by creating\nanchors based on a multi-slope assumption. This approach provides a detailed\nand accurate representation of the ground. HeightLane employs the predicted\nheightmap along with a deformable attention-based spatial feature transform\nframework to efficiently convert 2D image features into 3D bird's eye view\n(BEV) features, enhancing spatial understanding and lane structure recognition.\nAdditionally, the heightmap is used for the positional encoding of BEV\nfeatures, further improving their spatial accuracy. This explicit view\ntransformation bridges the gap between front-view perceptions and spatially\naccurate BEV representations, significantly improving detection performance. To\naddress the lack of the necessary ground truth (GT) height map in the original\nOpenLane dataset, we leverage the Waymo dataset and accumulate its LiDAR data\nto generate a height map for the drivable area of each scene. The GT heightmaps\nare used to train the heightmap extraction module from monocular images.\nExtensive experiments on the OpenLane validation set show that HeightLane\nachieves state-of-the-art performance in terms of F-score, highlighting its\npotential in real-world applications.\n","authors":["Chaesong Park","Eunbin Seo","Jongwoo Lim"],"pdf_url":"https://arxiv.org/pdf/2408.08270v2.pdf","comment":"10 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.14338v1","updated":"2025-01-24T09:03:27Z","published":"2025-01-24T09:03:27Z","title":"Correlation-Based Band Selection for Hyperspectral Image Classification","summary":" Hyperspectral images offer extensive spectral information about ground\nobjects across multiple spectral bands. However, the large volume of data can\npose challenges during processing. Typically, adjacent bands in hyperspectral\ndata are highly correlated, leading to the use of only a few selected bands for\nvarious applications. In this work, we present a correlation-based band\nselection approach for hyperspectral image classification. Our approach\ncalculates the average correlation between bands using correlation coefficients\nto identify the relationships among different bands. Afterward, we select a\nsubset of bands by analyzing the average correlation and applying a\nthreshold-based method. This allows us to isolate and retain bands that exhibit\nlower inter-band dependencies, ensuring that the selected bands provide diverse\nand non-redundant information. We evaluate our proposed approach on two\nstandard benchmark datasets: Pavia University (PA) and Salinas Valley (SA),\nfocusing on image classification tasks. 
The experimental results demonstrate\nthat our method performs competitively with other standard band selection\napproaches.\n","authors":["Dibyabha Deb","Ujjwal Verma"],"pdf_url":"https://arxiv.org/pdf/2501.14338v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2401.00766v5","updated":"2025-01-24T08:47:23Z","published":"2024-01-01T14:14:35Z","title":"Exposure Bracketing Is All You Need For A High-Quality Image","summary":" It is highly desired but challenging to acquire high-quality photos with\nclear content in low-light environments. Although multi-image processing\nmethods (using burst, dual-exposure, or multi-exposure images) have made\nsignificant progress in addressing this issue, they typically focus on specific\nrestoration or enhancement problems, and do not fully explore the potential of\nutilizing multiple images. Motivated by the fact that multi-exposure images are\ncomplementary in denoising, deblurring, high dynamic range imaging, and\nsuper-resolution, we propose to utilize exposure bracketing photography to get\na high-quality image by combining these tasks in this work. Due to the\ndifficulty in collecting real-world pairs, we suggest a solution that first\npre-trains the model with synthetic paired data and then adapts it to\nreal-world unlabeled images. In particular, a temporally modulated recurrent\nnetwork (TMRNet) and self-supervised adaptation method are proposed. Moreover,\nwe construct a data simulation pipeline to synthesize pairs and collect\nreal-world images from 200 nighttime scenarios. Experiments on both datasets\nshow that our method performs favorably against the state-of-the-art\nmulti-image processing ones. Code and datasets are available at\nhttps://github.com/cszhilu1998/BracketIRE.\n","authors":["Zhilu Zhang","Shuohao Zhang","Renlong Wu","Zifei Yan","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2401.00766v5.pdf","comment":"ICLR 2025"},{"id":"http://arxiv.org/abs/2501.14323v1","updated":"2025-01-24T08:35:22Z","published":"2025-01-24T08:35:22Z","title":"Automatic detection and prediction of nAMD activity change in retinal\n OCT using Siamese networks and Wasserstein Distance for ordinality","summary":" Neovascular age-related macular degeneration (nAMD) is a leading cause of\nvision loss among older adults, where disease activity detection and\nprogression prediction are critical for nAMD management in terms of timely drug\nadministration and improving patient outcomes. Recent advancements in deep\nlearning offer a promising solution for predicting changes in AMD from optical\ncoherence tomography (OCT) retinal volumes. In this work, we proposed deep\nlearning models for the two tasks of the public MARIO Challenge at MICCAI 2024,\ndesigned to detect and forecast changes in nAMD severity with longitudinal\nretinal OCT. For the first task, we employ a Vision Transformer (ViT) based\nSiamese Network to detect changes in AMD severity by comparing scan embeddings\nof a patient from different time points. To train a model to forecast the\nchange after 3 months, we exploit, for the first time, an Earth Mover\n(Wasserstein) Distance-based loss to harness the ordinal relation within the\nseverity change classes. 
Both models ranked high on the preliminary\nleaderboard, demonstrating that their predictive capabilities could facilitate\nnAMD treatment management.\n","authors":["Taha Emre","Teresa Araújo","Marzieh Oghbaie","Dmitrii Lachinov","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2501.14323v1.pdf","comment":"Solution to the MICCAI 2024 MARIO Challange. First 3 authors\n contributed equally. Models can be found at\n https://github.com/EmreTaha/Siamese-EMD-for-AMD-Change"},{"id":"http://arxiv.org/abs/2501.14319v1","updated":"2025-01-24T08:25:48Z","published":"2025-01-24T08:25:48Z","title":"Scalable Benchmarking and Robust Learning for Noise-Free Ego-Motion and\n 3D Reconstruction from Noisy Video","summary":" We aim to redefine robust ego-motion estimation and photorealistic 3D\nreconstruction by addressing a critical limitation: the reliance on noise-free\ndata in existing models. While such sanitized conditions simplify evaluation,\nthey fail to capture the unpredictable, noisy complexities of real-world\nenvironments. Dynamic motion, sensor imperfections, and synchronization\nperturbations lead to sharp performance declines when these models are deployed\nin practice, revealing an urgent need for frameworks that embrace and excel\nunder real-world noise. To bridge this gap, we tackle three core challenges:\nscalable data generation, comprehensive benchmarking, and model robustness\nenhancement. First, we introduce a scalable noisy data synthesis pipeline that\ngenerates diverse datasets simulating complex motion, sensor imperfections, and\nsynchronization errors. Second, we leverage this pipeline to create\nRobust-Ego3D, a benchmark rigorously designed to expose noise-induced\nperformance degradation, highlighting the limitations of current learning-based\nmethods in ego-motion accuracy and 3D reconstruction quality. Third, we propose\nCorrespondence-guided Gaussian Splatting (CorrGS), a novel test-time adaptation\nmethod that progressively refines an internal clean 3D representation by\naligning noisy observations with rendered RGB-D frames from clean 3D map,\nenhancing geometric alignment and appearance restoration through visual\ncorrespondence. Extensive experiments on synthetic and real-world data\ndemonstrate that CorrGS consistently outperforms prior state-of-the-art\nmethods, particularly in scenarios involving rapid motion and dynamic\nillumination.\n","authors":["Xiaohao Xu","Tianyi Zhang","Shibo Zhao","Xiang Li","Sibo Wang","Yongqi Chen","Ye Li","Bhiksha Raj","Matthew Johnson-Roberson","Sebastian Scherer","Xiaonan Huang"],"pdf_url":"https://arxiv.org/pdf/2501.14319v1.pdf","comment":"Accepted by ICLR 2025; 92 Pages; Project Repo:\n https://github.com/Xiaohao-Xu/SLAM-under-Perturbation. arXiv admin note:\n substantial text overlap with arXiv:2406.16850"},{"id":"http://arxiv.org/abs/2501.14317v1","updated":"2025-01-24T08:22:02Z","published":"2025-01-24T08:22:02Z","title":"Nautilus: Locality-aware Autoencoder for Scalable Mesh Generation","summary":" Triangle meshes are fundamental to 3D applications, enabling efficient\nmodification and rasterization while maintaining compatibility with standard\nrendering pipelines. However, current automatic mesh generation methods\ntypically rely on intermediate representations that lack the continuous surface\nquality inherent to meshes. Converting these representations into meshes\nproduces dense, suboptimal outputs. 
Although recent autoregressive approaches\ndemonstrate promise in directly modeling mesh vertices and faces, they are\nconstrained by the limitation in face count, scalability, and structural\nfidelity. To address these challenges, we propose Nautilus, a locality-aware\nautoencoder for artist-like mesh generation that leverages the local properties\nof manifold meshes to achieve structural fidelity and efficient representation.\nOur approach introduces a novel tokenization algorithm that preserves face\nproximity relationships and compresses sequence length through locally shared\nvertices and edges, enabling the generation of meshes with an unprecedented\nscale of up to 5,000 faces. Furthermore, we develop a Dual-stream Point\nConditioner that provides multi-scale geometric guidance, ensuring global\nconsistency and local structural fidelity by capturing fine-grained geometric\nfeatures. Extensive experiments demonstrate that Nautilus significantly\noutperforms state-of-the-art methods in both fidelity and scalability.\n","authors":["Yuxuan Wang","Xuanyu Yi","Haohan Weng","Qingshan Xu","Xiaokang Wei","Xianghui Yang","Chunchao Guo","Long Chen","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.14317v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2501.14316v1","updated":"2025-01-24T08:21:35Z","published":"2025-01-24T08:21:35Z","title":"PAID: A Framework of Product-Centric Advertising Image Design","summary":" In E-commerce platforms, a full advertising image is composed of a background\nimage and marketing taglines. Automatic ad image design reduces human costs and\nplays a crucial role. For the convenience of users, a novel automatic framework\nnamed Product-Centric Advertising Image Design (PAID) is proposed in this work.\nPAID takes the product foreground image, required taglines, and target size as\ninput and creates an ad image automatically. PAID consists of four sequential\nstages: prompt generation, layout generation, background image generation, and\ngraphics rendering. Different expert models are trained to conduct these\nsub-tasks. A visual language model (VLM) based prompt generation model is\nleveraged to produce a product-matching background prompt. The layout\ngeneration model jointly predicts text and image layout according to the\nbackground prompt, product, and taglines to achieve the best harmony. An\nSDXL-based layout-controlled inpainting model is trained to generate an\naesthetic background image. Previous ad image design methods take a background\nimage as input and then predict the layout of taglines, which limits the\nspatial layout due to fixed image content. Innovatively, our PAID adjusts the\nstages to produce an unrestricted layout. To complete the PAID framework, we\ncreated two high-quality datasets, PITA and PIL. Extensive experimental results\nshow that PAID creates more visually pleasing advertising images than previous\nmethods.\n","authors":["Hongyu Chen","Min Zhou","Jing Jiang","Jiale Chen","Yang Lu","Bo Xiao","Tiezheng Ge","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.14316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12104v3","updated":"2025-01-24T08:20:19Z","published":"2025-01-21T12:55:04Z","title":"Teacher Encoder-Student Decoder Denoising Guided Segmentation Network\n for Anomaly Detection","summary":" Visual anomaly detection is a highly challenging task, often categorized as a\none-class classification and segmentation problem. 
Recent studies have\ndemonstrated that the student-teacher (S-T) framework effectively addresses\nthis challenge. However, most S-T frameworks rely solely on pre-trained teacher\nnetworks to guide student networks in learning multi-scale similar features,\noverlooking the potential of the student networks to enhance learning through\nmulti-scale feature fusion. In this study, we propose a novel model named\nPFADSeg, which integrates a pre-trained teacher network, a denoising student\nnetwork with multi-scale feature fusion, and a guided anomaly segmentation\nnetwork into a unified framework. By adopting a unique teacher-encoder and\nstudent-decoder denoising mode, the model improves the student network's\nability to learn from teacher network features. Furthermore, an adaptive\nfeature fusion mechanism is introduced to train a self-supervised segmentation\nnetwork that synthesizes anomaly masks autonomously, significantly increasing\ndetection performance. Evaluated on the MVTec AD dataset, PFADSeg achieves\nstate-of-the-art results with an image-level AUC of 98.9%, a pixel-level mean\nprecision of 76.4%, and an instance-level mean precision of 78.7%.\n","authors":["Shixuan Song","Hao Chen","Shu Hu","Xin Wang","Jinrong Hu","Xi Wu"],"pdf_url":"https://arxiv.org/pdf/2501.12104v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14309v1","updated":"2025-01-24T08:10:47Z","published":"2025-01-24T08:10:47Z","title":"BrainGuard: Privacy-Preserving Multisubject Image Reconstructions from\n Brain Activities","summary":" Reconstructing perceived images from human brain activity forms a crucial\nlink between human and machine learning through Brain-Computer Interfaces.\nEarly methods primarily focused on training separate models for each individual\nto account for individual variability in brain activity, overlooking valuable\ncross-subject commonalities. Recent advancements have explored multisubject\nmethods, but these approaches face significant challenges, particularly in data\nprivacy and effectively managing individual variability. To overcome these\nchallenges, we introduce BrainGuard, a privacy-preserving collaborative\ntraining framework designed to enhance image reconstruction from multisubject\nfMRI data while safeguarding individual privacy. BrainGuard employs a\ncollaborative global-local architecture where individual models are trained on\neach subject's local data and operate in conjunction with a shared global model\nthat captures and leverages cross-subject patterns. This architecture\neliminates the need to aggregate fMRI data across subjects, thereby ensuring\nprivacy preservation. To tackle the complexity of fMRI data, BrainGuard\nintegrates a hybrid synchronization strategy, enabling individual models to\ndynamically incorporate parameters from the global model. 
By establishing a\nsecure and collaborative training environment, BrainGuard not only protects\nsensitive brain data but also improves the image reconstructions accuracy.\nExtensive experiments demonstrate that BrainGuard sets a new benchmark in both\nhigh-level and low-level metrics, advancing the state-of-the-art in brain\ndecoding through its innovative design.\n","authors":["Zhibo Tian","Ruijie Quan","Fan Ma","Kun Zhan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2501.14309v1.pdf","comment":"AAAI 2025 oral"},{"id":"http://arxiv.org/abs/2501.14308v1","updated":"2025-01-24T08:10:05Z","published":"2025-01-24T08:10:05Z","title":"Learning Primitive Relations for Compositional Zero-Shot Learning","summary":" Compositional Zero-Shot Learning (CZSL) aims to identify unseen state-object\ncompositions by leveraging knowledge learned from seen compositions. Existing\napproaches often independently predict states and objects, overlooking their\nrelationships. In this paper, we propose a novel framework, learning primitive\nrelations (LPR), designed to probabilistically capture the relationships\nbetween states and objects. By employing the cross-attention mechanism, LPR\nconsiders the dependencies between states and objects, enabling the model to\ninfer the likelihood of unseen compositions. Experimental results demonstrate\nthat LPR outperforms state-of-the-art methods on all three CZSL benchmark\ndatasets in both closed-world and open-world settings. Through qualitative\nanalysis, we show that LPR leverages state-object relationships for unseen\ncomposition prediction.\n","authors":["Insu Lee","Jiseob Kim","Kyuhong Shim","Byonghyo Shim"],"pdf_url":"https://arxiv.org/pdf/2501.14308v1.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.14306v1","updated":"2025-01-24T08:05:49Z","published":"2025-01-24T08:05:49Z","title":"Additive Manufacturing Processes Protocol Prediction by Artificial\n Intelligence using X-ray Computed Tomography data","summary":" The quality of the part fabricated from the Additive Manufacturing (AM)\nprocess depends upon the process parameters used, and therefore, optimization\nis required for apt quality. A methodology is proposed to set these parameters\nnon-iteratively without human intervention. It utilizes Artificial Intelligence\n(AI) to fully automate the process, with the capability to self-train any apt\nAI model by further assimilating the training data.This study includes three\ncommercially available 3D printers for soft material printing based on the\nMaterial Extrusion (MEX) AM process. The samples are 3D printed for six\ndifferent AM process parameters obtained by varying layer height and nozzle\nspeed. The novelty part of the methodology is incorporating an AI-based image\nsegmentation step in the decision-making stage that uses quality inspected\ntraining data from the Non-Destructive Testing (NDT) method. The performance of\nthe trained AI model is compared with the two software tools based on the\nclassical thresholding method. The AI-based Artificial Neural Network (ANN)\nmodel is trained from NDT-assessed and AI-segmented data to automate the\nselection of optimized process parameters. The AI-based model is 99.3 %\naccurate, while the best available commercial classical image method is 83.44 %\naccurate. The best value of overall R for training ANN is 0.82. The MEX process\ngives a 22.06 % porosity error relative to the design. 
The NDT-data trained two\nAI models integrated into a series pipeline for optimal process parameters are\nproposed and verified by classical optimization and mechanical testing methods.\n","authors":["Sunita Khod","Akshay Dvivedi","Mayank Goswami"],"pdf_url":"https://arxiv.org/pdf/2501.14306v1.pdf","comment":"21 pages, 21 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.14302v1","updated":"2025-01-24T08:00:25Z","published":"2025-01-24T08:00:25Z","title":"TD-RD: A Top-Down Benchmark with Real-Time Framework for Road Damage\n Detection","summary":" Object detection has witnessed remarkable advancements over the past decade,\nlargely driven by breakthroughs in deep learning and the proliferation of large\nscale datasets. However, the domain of road damage detection remains relatively\nunder explored, despite its critical significance for applications such as\ninfrastructure maintenance and road safety. This paper addresses this gap by\nintroducing a novel top down benchmark that offers a complementary perspective\nto existing datasets, specifically tailored for road damage detection. Our\nproposed Top Down Road Damage Detection Dataset (TDRD) includes three primary\ncategories of road damage cracks, potholes, and patches captured from a top\ndown viewpoint. The dataset consists of 7,088 high resolution images,\nencompassing 12,882 annotated instances of road damage. Additionally, we\npresent a novel real time object detection framework, TDYOLOV10, designed to\nhandle the unique challenges posed by the TDRD dataset. Comparative studies\nwith state of the art models demonstrate competitive baseline results. By\nreleasing TDRD, we aim to accelerate research in this crucial area. A sample of\nthe dataset will be made publicly available upon the paper's acceptance.\n","authors":["Xi Xiao","Zhengji Li","Wentao Wang","Jiacheng Xie","Houjie Lin","Swalpa Kumar Roy","Tianyang Wang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2501.14302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13341v2","updated":"2025-01-24T07:54:10Z","published":"2025-01-23T02:45:35Z","title":"Multi-aspect Knowledge Distillation with Large Language Model","summary":" Recent advancements in deep learning have significantly improved performance\non computer vision tasks. Previous image classification methods primarily\nmodify model architectures or add features, and they optimize models using\ncross-entropy loss on class logits. Since they focus on classifying images with\nconsidering class labels, these methods may struggle to learn various\n\\emph{aspects} of classes (e.g., natural positions and shape changes).\nRethinking the previous approach from a novel view, we propose a multi-aspect\nknowledge distillation method using Multimodal Large Language Models (MLLMs).\nOur approach involves: 1) querying Large Language Model with multi-aspect\nquestions relevant to the knowledge we want to transfer to the model, 2)\nextracting corresponding logits from MLLM, and 3) expanding the model's output\ndimensions to distill these multi-aspect logits. We then apply cross-entropy\nloss to class logits and binary cross-entropy loss to multi-aspect logits.\nThrough our method, the model can learn not only the knowledge about visual\naspects but also the abstract and complex aspects that require a deeper\nunderstanding. We primarily apply our method to image classification, and to\nexplore the potential for extending our model, we expand it to other tasks,\nsuch as object detection. 
In all experimental results, our method improves the\nperformance of the baselines. Additionally, we analyze the effect of\nmulti-aspect knowledge distillation. These results demonstrate that our method\ncan transfer knowledge about various aspects to the model and the aspect\nknowledge can enhance model performance in computer vision tasks. This paper\ndemonstrates the great potential of multi-aspect knowledge distillation, and we\nbelieve it offers a promising direction for future research in computer vision\nand beyond.\n","authors":["Taegyeong Lee","Jinsik Bang","Soyeong Kwon","Taehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13341v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2212.12770v2","updated":"2025-01-24T07:51:05Z","published":"2022-12-24T16:38:59Z","title":"COLT: Cyclic Overlapping Lottery Tickets for Faster Pruning of\n Convolutional Neural Networks","summary":" Pruning refers to the elimination of trivial weights from neural networks.\nThe sub-networks within an overparameterized model produced after pruning are\noften called Lottery tickets. This research aims to generate winning lottery\ntickets from a set of lottery tickets that can achieve similar accuracy to the\noriginal unpruned network. We introduce a novel winning ticket called Cyclic\nOverlapping Lottery Ticket (COLT) by data splitting and cyclic retraining of\nthe pruned network from scratch. We apply a cyclic pruning algorithm that keeps\nonly the overlapping weights of different pruned models trained on different\ndata segments. Our results demonstrate that COLT can achieve similar accuracies\n(obtained by the unpruned model) while maintaining high sparsities. We show\nthat the accuracy of COLT is on par with the winning tickets of Lottery Ticket\nHypothesis (LTH) and, at times, is better. Moreover, COLTs can be generated\nusing fewer iterations than tickets generated by the popular Iterative\nMagnitude Pruning (IMP) method. In addition, we also notice COLTs generated on\nlarge datasets can be transferred to small ones without compromising\nperformance, demonstrating its generalizing capability. We conduct all our\nexperiments on Cifar-10, Cifar-100 & TinyImageNet datasets and report superior\nperformance than the state-of-the-art methods.\n","authors":["Md. Ismail Hossain","Mohammed Rakib","M. M. Lutfe Elahi","Nabeel Mohammed","Shafin Rahman"],"pdf_url":"https://arxiv.org/pdf/2212.12770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06814v2","updated":"2025-01-24T07:28:49Z","published":"2024-04-10T08:02:17Z","title":"ComPC: Completing a 3D Point Cloud with 2D Diffusion Priors","summary":" 3D point clouds directly collected from objects through sensors are often\nincomplete due to self-occlusion. Conventional methods for completing these\npartial point clouds rely on manually organized training sets and are usually\nlimited to object categories seen during training. In this work, we propose a\ntest-time framework for completing partial point clouds across unseen\ncategories without any requirement for training. Leveraging point rendering via\nGaussian Splatting, we develop techniques of Partial Gaussian Initialization,\nZero-shot Fractal Completion, and Point Cloud Extraction that utilize priors\nfrom pre-trained 2D diffusion models to infer missing regions and extract\nuniform completed point clouds. Experimental results on both synthetic and\nreal-world scanned point clouds demonstrate that our approach outperforms\nexisting methods in completing a variety of objects. 
Our project page is at\n\\url{https://tianxinhuang.github.io/projects/ComPC/}.\n","authors":["Tianxin Huang","Zhiwen Yan","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.06814v2.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2410.11610v5","updated":"2025-01-24T07:04:50Z","published":"2024-10-15T13:46:19Z","title":"Enhanced Encoder-Decoder Architecture for Accurate Monocular Depth\n Estimation","summary":" Estimating depth from a single 2D image is a challenging task due to the lack\nof stereo or multi-view data, which are typically required for depth\nperception. In state-of-the-art architectures, the main challenge is to\nefficiently capture complex objects and fine-grained details, which are often\ndifficult to predict. This paper introduces a novel deep learning-based\napproach using an enhanced encoder-decoder architecture, where the\nInception-ResNet-v2 model serves as the encoder. This is the first instance of\nutilizing Inception-ResNet-v2 as an encoder for monocular depth estimation,\ndemonstrating improved performance over previous models. It incorporates\nmulti-scale feature extraction to enhance depth prediction accuracy across\nvarious object sizes and distances. We propose a composite loss function\ncomprising depth loss, gradient edge loss, and Structural Similarity Index\nMeasure (SSIM) loss, with fine-tuned weights to optimize the weighted sum,\nensuring a balance across different aspects of depth estimation. Experimental\nresults on the KITTI dataset show that our model achieves a significantly\nfaster inference time of 0.019 seconds, outperforming vision transformers in\nefficiency while maintaining good accuracy. On the NYU Depth V2 dataset, the\nmodel establishes state-of-the-art performance, with an Absolute Relative Error\n(ARE) of 0.064, a Root Mean Square Error (RMSE) of 0.228, and an accuracy of\n89.3% for $\\delta$ < 1.25. These metrics demonstrate that our model can\naccurately and efficiently predict depth even in challenging scenarios,\nproviding a practical solution for real-time applications.\n","authors":["Dabbrata Das","Argho Deb Das","Farhan Sadaf"],"pdf_url":"https://arxiv.org/pdf/2410.11610v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14287v1","updated":"2025-01-24T07:04:27Z","published":"2025-01-24T07:04:27Z","title":"Snapshot multi-spectral imaging through defocusing and a Fourier imager\n network","summary":" Multi-spectral imaging, which simultaneously captures the spatial and\nspectral information of a scene, is widely used across diverse fields,\nincluding remote sensing, biomedical imaging, and agricultural monitoring.\nHere, we introduce a snapshot multi-spectral imaging approach employing a\nstandard monochrome image sensor with no additional spectral filters or\ncustomized components. Our system leverages the inherent chromatic aberration\nof wavelength-dependent defocusing as a natural source of physical encoding of\nmulti-spectral information; this encoded image information is rapidly decoded\nvia a deep learning-based multi-spectral Fourier Imager Network (mFIN). We\nexperimentally tested our method with six illumination bands and demonstrated\nan overall accuracy of 92.98% for predicting the illumination channels at the\ninput and achieved a robust multi-spectral image reconstruction on various test\nobjects. 
This deep learning-powered framework achieves high-quality\nmulti-spectral image reconstruction using snapshot image acquisition with a\nmonochrome image sensor and could be useful for applications in biomedicine,\nindustrial quality control, and agriculture, among others.\n","authors":["Xilin Yang","Michael John Fanous","Hanlong Chen","Ryan Lee","Paloma Casteleiro Costa","Yuhang Li","Luzhe Huang","Yijie Zhang","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2501.14287v1.pdf","comment":"22 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2501.14279v1","updated":"2025-01-24T06:50:21Z","published":"2025-01-24T06:50:21Z","title":"Deep Learning-Powered Classification of Thoracic Diseases in Chest\n X-Rays","summary":" Chest X-rays play a pivotal role in diagnosing respiratory diseases such as\npneumonia, tuberculosis, and COVID-19, which are prevalent and present unique\ndiagnostic challenges due to overlapping visual features and variability in\nimage quality. Severe class imbalance and the complexity of medical images\nhinder automated analysis. This study leverages deep learning techniques,\nincluding transfer learning on pre-trained models (AlexNet, ResNet, and\nInceptionNet), to enhance disease detection and classification. By fine-tuning\nthese models and incorporating focal loss to address class imbalance,\nsignificant performance improvements were achieved. Grad-CAM visualizations\nfurther enhance model interpretability, providing insights into clinically\nrelevant regions influencing predictions. The InceptionV3 model, for instance,\nachieved a 28% improvement in AUC and a 15% increase in F1-Score. These\nfindings highlight the potential of deep learning to improve diagnostic\nworkflows and support clinical decision-making.\n","authors":["Yiming Lei","Michael Nguyen","Tzu Chia Liu","Hyounkyun Oh"],"pdf_url":"https://arxiv.org/pdf/2501.14279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14277v1","updated":"2025-01-24T06:45:12Z","published":"2025-01-24T06:45:12Z","title":"Dense-SfM: Structure from Motion with Dense Consistent Matching","summary":" We present Dense-SfM, a novel Structure from Motion (SfM) framework designed\nfor dense and accurate 3D reconstruction from multi-view images. Sparse\nkeypoint matching, which traditional SfM methods often rely on, limits both\naccuracy and point density, especially in texture-less areas. Dense-SfM\naddresses this limitation by integrating dense matching with a Gaussian\nSplatting (GS) based track extension which gives more consistent, longer\nfeature tracks. To further improve reconstruction accuracy, Dense-SfM is\nequipped with a multi-view kernelized matching module leveraging transformer\nand Gaussian Process architectures, for robust track refinement across\nmulti-views. 
Evaluations on the ETH3D and Texture-Poor SfM datasets show that\nDense-SfM offers significant improvements in accuracy and density over\nstate-of-the-art methods.\n","authors":["JongMin Lee","Sungjoo Yoo"],"pdf_url":"https://arxiv.org/pdf/2501.14277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14276v1","updated":"2025-01-24T06:42:06Z","published":"2025-01-24T06:42:06Z","title":"Global Semantic-Guided Sub-image Feature Weight Allocation in\n High-Resolution Large Vision-Language Models","summary":" As the demand for high-resolution image processing in Large Vision-Language\nModels (LVLMs) grows, sub-image partitioning has become a popular approach for\nmitigating visual information loss associated with fixed-resolution processing.\nHowever, existing partitioning methods uniformly process sub-images, resulting\nin suboptimal image understanding. In this work, we reveal that the sub-images\nwith higher semantic relevance to the entire image encapsulate richer visual\ninformation for preserving the model's visual understanding ability. Therefore,\nwe propose the Global Semantic-guided Weight Allocator (GSWA) module, which\ndynamically allocates weights to sub-images based on their relative information\ndensity, emulating human visual attention mechanisms. This approach enables the\nmodel to focus on more informative regions, overcoming the limitations of\nuniform treatment. We integrate GSWA into the InternVL2-2B framework to create\nSleighVL, a lightweight yet high-performing model. Extensive experiments\ndemonstrate that SleighVL outperforms models with comparable parameters and\nremains competitive with larger models. Our work provides a promising direction\nfor more efficient and contextually aware high-resolution image processing in\nLVLMs, advancing multimodal system development.\n","authors":["Yuxuan Liang","Xu Li","Xiaolei Chen","Haotian Chen","Yi Zheng","Chenghang Lai","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2501.14276v1.pdf","comment":"10 pages, 10 figures and tables"},{"id":"http://arxiv.org/abs/2501.14265v1","updated":"2025-01-24T06:07:11Z","published":"2025-01-24T06:07:11Z","title":"Bayesian Neural Networks for One-to-Many Mapping in Image Enhancement","summary":" In image enhancement tasks, such as low-light and underwater image\nenhancement, a degraded image can correspond to multiple plausible target\nimages due to dynamic photography conditions, such as variations in\nillumination. This naturally results in a one-to-many mapping challenge. To\naddress this, we propose a Bayesian Enhancement Model (BEM) that incorporates\nBayesian Neural Networks (BNNs) to capture data uncertainty and produce diverse\noutputs. To achieve real-time inference, we introduce a two-stage approach:\nStage I employs a BNN to model the one-to-many mappings in the low-dimensional\nspace, while Stage II refines fine-grained image details using a Deterministic\nNeural Network (DNN). To accelerate BNN training and convergence, we introduce\na dynamic \\emph{Momentum Prior}. 
Extensive experiments on multiple low-light\nand underwater image enhancement benchmarks demonstrate the superiority of our\nmethod over deterministic models.\n","authors":["Guoxi Huang","Nantheera Anantrasirichai","Fei Ye","Zipeng Qi","RuiRui Lin","Qirui Yang","David Bull"],"pdf_url":"https://arxiv.org/pdf/2501.14265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14264v1","updated":"2025-01-24T06:05:47Z","published":"2025-01-24T06:05:47Z","title":"CDI: Blind Image Restoration Fidelity Evaluation based on Consistency\n with Degraded Image","summary":" Recent advancements in Blind Image Restoration (BIR) methods, based on\nGenerative Adversarial Networks and Diffusion Models, have significantly\nimproved visual quality. However, they present significant challenges for Image\nQuality Assessment (IQA), as the existing Full-Reference IQA methods often rate\nimages with high perceptual quality poorly. In this paper, we reassess the\nSolution Non-Uniqueness and Degradation Indeterminacy issues of BIR, and\npropose constructing a specific BIR IQA system. In stead of directly comparing\na restored image with a reference image, the BIR IQA evaluates fidelity by\ncalculating the Consistency with Degraded Image (CDI). Specifically, we propose\na wavelet domain Reference Guided CDI algorithm, which can acquire the\nconsistency with a degraded image for various types without requiring knowledge\nof degradation parameters. The supported degradation types include down\nsampling, blur, noise, JPEG and complex combined degradations etc. In addition,\nwe propose a Reference Agnostic CDI, enabling BIR fidelity evaluation without\nreference images. Finally, in order to validate the rationality of CDI, we\ncreate a new Degraded Images Switch Display Comparison Dataset (DISDCD) for\nsubjective evaluation of BIR fidelity. Experiments conducted on DISDCD verify\nthat CDI is markedly superior to common Full Reference IQA methods for BIR\nfidelity evaluation. The source code and the DISDCD dataset will be publicly\navailable shortly.\n","authors":["Xiaojun Tang","Jingru Wang","Guangwei Huang","Guannan Chen","Rui Zheng","Lian Huai","Yuyu Liu","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2501.14264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02705v2","updated":"2025-01-24T05:25:24Z","published":"2024-10-03T17:28:07Z","title":"ControlAR: Controllable Image Generation with Autoregressive Models","summary":" Autoregressive (AR) models have reformulated image generation as next-token\nprediction, demonstrating remarkable potential and emerging as strong\ncompetitors to diffusion models. However, control-to-image generation, akin to\nControlNet, remains largely unexplored within AR models. Although a natural\napproach, inspired by advancements in Large Language Models, is to tokenize\ncontrol images into tokens and prefill them into the autoregressive model\nbefore decoding image tokens, it still falls short in generation quality\ncompared to ControlNet and suffers from inefficiency. To this end, we introduce\nControlAR, an efficient and effective framework for integrating spatial\ncontrols into autoregressive image generation models. 
Firstly, we explore\ncontrol encoding for AR models and propose a lightweight control encoder to\ntransform spatial inputs (e.g., canny edges or depth maps) into control tokens.\nThen ControlAR exploits the conditional decoding method to generate the next\nimage token conditioned on the per-token fusion between control and image\ntokens, similar to positional encodings. Compared to prefilling tokens, using\nconditional decoding significantly strengthens the control capability of AR\nmodels but also maintains the model's efficiency. Furthermore, the proposed\nControlAR surprisingly empowers AR models with arbitrary-resolution image\ngeneration via conditional decoding and specific controls. Extensive\nexperiments can demonstrate the controllability of the proposed ControlAR for\nthe autoregressive control-to-image generation across diverse inputs, including\nedges, depths, and segmentation masks. Furthermore, both quantitative and\nqualitative results indicate that ControlAR surpasses previous state-of-the-art\ncontrollable diffusion models, e.g., ControlNet++. Code, models, and demo will\nsoon be available at https://github.com/hustvl/ControlAR.\n","authors":["Zongming Li","Tianheng Cheng","Shoufa Chen","Peize Sun","Haocheng Shen","Longjin Ran","Xiaoxin Chen","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.02705v2.pdf","comment":"To appear in ICLR 2025. Work in progress"},{"id":"http://arxiv.org/abs/2409.07796v2","updated":"2025-01-24T05:24:14Z","published":"2024-09-12T06:56:52Z","title":"In-Situ Fine-Tuning of Wildlife Models in IoT-Enabled Camera Traps for\n Efficient Adaptation","summary":" Resource-constrained IoT devices increasingly rely on deep learning models\nfor inference tasks in remote environments. However, these models experience\nsignificant accuracy drops due to domain shifts when encountering variations in\nlighting, weather, and seasonal conditions. While cloud-based retraining can\naddress this issue, many IoT deployments operate with limited connectivity and\nenergy constraints, making traditional fine-tuning approaches impractical. We\nexplore this challenge through the lens of wildlife ecology, where camera traps\nmust maintain accurate species classification across changing seasons, weather,\nand habitats without reliable connectivity. We introduce WildFit, an autonomous\nin-situ adaptation framework that leverages the key insight that background\nscenes change more frequently than the visual characteristics of monitored\nspecies. WildFit combines background-aware synthesis to generate training\nsamples on-device with drift-aware fine-tuning that triggers model updates only\nwhen necessary to conserve resources. Through extensive evaluation on multiple\ncamera trap deployments, we demonstrate that WildFit significantly improves\naccuracy while greatly reducing adaptation overhead compared to traditional\napproaches.\n","authors":["Mohammad Mehdi Rastikerdar","Jin Huang","Hui Guan","Deepak Ganesan"],"pdf_url":"https://arxiv.org/pdf/2409.07796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07888v3","updated":"2025-01-24T05:16:36Z","published":"2025-01-14T06:54:39Z","title":"Tarsier2: Advancing Large Vision-Language Models from Detailed Video\n Description to Comprehensive Video Understanding","summary":" We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM)\ndesigned for generating detailed and accurate video descriptions, while also\nexhibiting superior general video understanding capabilities. 
Tarsier2 achieves\nsignificant advancements through three key upgrades: (1) Scaling pre-training\ndata from 11M to 40M video-text pairs, enriching both volume and diversity; (2)\nPerforming fine-grained temporal alignment during supervised fine-tuning; (3)\nUsing model-based sampling to automatically construct preference data and\napplying DPO training for optimization. Extensive experiments show that\nTarsier2-7B consistently outperforms leading proprietary models, including\nGPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K\nbenchmark, Tarsier2-7B improves F1 by 2.8% over GPT-4o and 5.8% over\nGemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6%\nperformance advantage over GPT-4o and +24.9% over Gemini-1.5-Pro. Tarsier2-7B\nalso sets new state-of-the-art results across 15 public benchmarks, spanning\ntasks such as video question-answering, video grounding, hallucination test,\nand embodied question-answering, demonstrating its versatility as a robust\ngeneralist vision-language model.\n","authors":["Liping Yuan","Jiawei Wang","Haomiao Sun","Yuchen Zhang","Yuan Lin"],"pdf_url":"https://arxiv.org/pdf/2501.07888v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14238v1","updated":"2025-01-24T04:50:16Z","published":"2025-01-24T04:50:16Z","title":"Point-LN: A Lightweight Framework for Efficient Point Cloud\n Classification Using Non-Parametric Positional Encoding","summary":" We introduce Point-LN, a novel lightweight framework engineered for efficient\n3D point cloud classification. Point-LN integrates essential non-parametric\ncomponents-such as Farthest Point Sampling (FPS), k-Nearest Neighbors (k-NN),\nand non-learnable positional encoding-with a streamlined learnable classifier\nthat significantly enhances classification accuracy while maintaining a minimal\nparameter footprint. This hybrid architecture ensures low computational costs\nand rapid inference speeds, making Point-LN ideal for real-time and\nresource-constrained applications. Comprehensive evaluations on benchmark\ndatasets, including ModelNet40 and ScanObjectNN, demonstrate that Point-LN\nachieves competitive performance compared to state-of-the-art methods, all\nwhile offering exceptional efficiency. These results establish Point-LN as a\nrobust and scalable solution for diverse point cloud classification tasks,\nhighlighting its potential for widespread adoption in various computer vision\napplications.\n","authors":["Marzieh Mohammadi","Amir Salarpour","Pedram MohajerAnsari"],"pdf_url":"https://arxiv.org/pdf/2501.14238v1.pdf","comment":"This paper has been accepted for presentation at the 29th\n International Computer Conference, Computer Society of Iran (CSICC) 2025"},{"id":"http://arxiv.org/abs/2408.01162v2","updated":"2025-01-24T04:42:06Z","published":"2024-08-02T10:24:35Z","title":"PreMix: Addressing Label Scarcity in Whole Slide Image Classification\n with Pre-trained Multiple Instance Learning Aggregators","summary":" Multiple instance learning (MIL) has emerged as a powerful framework for\nweakly supervised whole slide image (WSI) classification, enabling slide-level\npredictions without requiring detailed patch-level annotations. However, a key\nlimitation of MIL lies in the underexplored potential of pre-training the MIL\naggregator. Most existing approaches train it from scratch, resulting in\nperformance heavily dependent on the number of labeled WSIs, while overlooking\nthe abundance of unlabeled WSIs available in real-world scenarios. 
To address\nthis, we propose PreMix, a novel framework that leverages a non-contrastive\npre-training method, Barlow Twins, augmented with the Slide Mixing approach to\ngenerate additional positive pairs and enhance feature learning, particularly\nunder limited labeled WSI conditions. Fine-tuning with Mixup and Manifold Mixup\nfurther enhances robustness by effectively handling the diverse sizes of\ngigapixel WSIs. Experimental results demonstrate that integrating HIPT into\nPreMix achieves an average F1 improvement of 4.7% over the baseline HIPT across\nvarious WSI training datasets and label sizes. These findings underscore its\npotential to advance WSI classification with limited labeled data and its\napplicability to real-world histopathology practices. The code is available at\nhttps://anonymous.4open.science/r/PreMix\n","authors":["Bryan Wong","Mun Yong Yi"],"pdf_url":"https://arxiv.org/pdf/2408.01162v2.pdf","comment":"Under review for the Biomedical Signal Processing and Control journal"},{"id":"http://arxiv.org/abs/2501.14231v1","updated":"2025-01-24T04:37:57Z","published":"2025-01-24T04:37:57Z","title":"Micro-macro Wavelet-based Gaussian Splatting for 3D Reconstruction from\n Unconstrained Images","summary":" 3D reconstruction from unconstrained image collections presents substantial\nchallenges due to varying appearances and transient occlusions. In this paper,\nwe introduce Micro-macro Wavelet-based Gaussian Splatting (MW-GS), a novel\napproach designed to enhance 3D reconstruction by disentangling scene\nrepresentations into global, refined, and intrinsic components. The proposed\nmethod features two key innovations: Micro-macro Projection, which allows\nGaussian points to capture details from feature maps across multiple scales\nwith enhanced diversity; and Wavelet-based Sampling, which leverages frequency\ndomain information to refine feature representations and significantly improve\nthe modeling of scene appearances. Additionally, we incorporate a Hierarchical\nResidual Fusion Network to seamlessly integrate these features. Extensive\nexperiments demonstrate that MW-GS delivers state-of-the-art rendering\nperformance, surpassing existing methods.\n","authors":["Yihui Li","Chengxin Lv","Hongyu Yang","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2501.14231v1.pdf","comment":"11 pages, 6 figures,accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.14230v1","updated":"2025-01-24T04:17:03Z","published":"2025-01-24T04:17:03Z","title":"GreedyPixel: Fine-Grained Black-Box Adversarial Attack Via Greedy\n Algorithm","summary":" A critical requirement for deep learning models is ensuring their robustness\nagainst adversarial attacks. These attacks commonly introduce noticeable\nperturbations, compromising the visual fidelity of adversarial examples.\nAnother key challenge is that while white-box algorithms can generate effective\nadversarial perturbations, they require access to the model gradients, limiting\ntheir practicality in many real-world scenarios. Existing attack mechanisms\nstruggle to achieve similar efficacy without access to these gradients. In this\npaper, we introduce GreedyPixel, a novel pixel-wise greedy algorithm designed\nto generate high-quality adversarial examples using only query-based feedback\nfrom the target model. GreedyPixel improves computational efficiency in what is\ntypically a brute-force process by perturbing individual pixels in sequence,\nguided by a pixel-wise priority map. 
This priority map is constructed by\nranking gradients obtained from a surrogate model, providing a structured path\nfor perturbation. Our results demonstrate that GreedyPixel achieves attack\nsuccess rates comparable to white-box methods without the need for gradient\ninformation, and surpasses existing algorithms in black-box settings, offering\nhigher success rates, reduced computational time, and imperceptible\nperturbations. These findings underscore the advantages of GreedyPixel in terms\nof attack efficacy, time efficiency, and visual quality.\n","authors":["Hanrui Wang","Ching-Chun Chang","Chun-Shien Lu","Christopher Leckie","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2501.14230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14228v1","updated":"2025-01-24T04:16:03Z","published":"2025-01-24T04:16:03Z","title":"Detection and Classification of Acute Lymphoblastic Leukemia Utilizing\n Deep Transfer Learning","summary":" A mutation in the DNA of a single cell that compromises its function\ninitiates leukemia,leading to the overproduction of immature white blood cells\nthat encroach upon the space required for the generation of healthy blood\ncells.Leukemia is treatable if identified in its initial stages. However,its\ndiagnosis is both arduous and time consuming. This study proposes a novel\napproach for diagnosing leukemia across four stages Benign,Early,Pre,and Pro\nusing deep learning techniques.We employed two Convolutional Neural Network\n(CNN) models as MobileNetV2 with an altered head and a custom model. The custom\nmodel consists of multiple convolutional layers,each paired with corresponding\nmax pooling layers.We utilized MobileNetV2 with ImageNet weights,adjusting the\nhead to integrate the final results.The dataset used is the publicly available\n\"Acute Lymphoblastic Leukemia (ALL) Image Dataset\", and we applied the\nSynthetic Minority Oversampling Technique (SMOTE) to augment and balance the\ntraining dataset.The custom model achieved an accuracy of 98.6%, while\nMobileNetV2 attained a superior accuracy of 99.69%. The pretrained model showed\npromising results,indicating an increased likelihood of real-world application.\n","authors":["Md. Abu Ahnaf Mollick","Md. Mahfujur Rahman","D. M. Asadujjaman","Abdullah Tamim","Nosin Anjum Dristi","Md. Takbir Hossen"],"pdf_url":"https://arxiv.org/pdf/2501.14228v1.pdf","comment":"4 pages, 4 figures, Submitted to UCICS"},{"id":"http://arxiv.org/abs/2501.01720v2","updated":"2025-01-24T03:46:28Z","published":"2025-01-03T09:25:04Z","title":"Interpretable Face Anti-Spoofing: Enhancing Generalization with\n Multimodal Large Language Models","summary":" Face Anti-Spoofing (FAS) is essential for ensuring the security and\nreliability of facial recognition systems. Most existing FAS methods are\nformulated as binary classification tasks, providing confidence scores without\ninterpretation. They exhibit limited generalization in out-of-domain scenarios,\nsuch as new environments or unseen spoofing types. In this work, we introduce a\nmultimodal large language model (MLLM) framework for FAS, termed Interpretable\nFace Anti-Spoofing (I-FAS), which transforms the FAS task into an interpretable\nvisual question answering (VQA) paradigm. Specifically, we propose a\nSpoof-aware Captioning and Filtering (SCF) strategy to generate high-quality\ncaptions for FAS images, enriching the model's supervision with natural\nlanguage interpretations. 
To mitigate the impact of noisy captions during\ntraining, we develop a Lopsided Language Model (L-LM) loss function that\nseparates loss calculations for judgment and interpretation, prioritizing the\noptimization of the former. Furthermore, to enhance the model's perception of\nglobal visual features, we design a Globally Aware Connector (GAC) to align\nmulti-level visual representations with the language model. Extensive\nexperiments on standard and newly devised One to Eleven cross-domain\nbenchmarks, comprising 12 public datasets, demonstrate that our method\nsignificantly outperforms state-of-the-art methods.\n","authors":["Guosheng Zhang","Keyao Wang","Haixiao Yue","Ajian Liu","Gang Zhang","Kun Yao","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2501.01720v2.pdf","comment":"Accepted to AAAI2025(Oral)"},{"id":"http://arxiv.org/abs/2501.14210v1","updated":"2025-01-24T03:28:37Z","published":"2025-01-24T03:28:37Z","title":"PuzzleGPT: Emulating Human Puzzle-Solving Ability for Time and Location\n Prediction","summary":" The task of predicting time and location from images is challenging and\nrequires complex human-like puzzle-solving ability over different clues. In\nthis work, we formalize this ability into core skills and implement them using\ndifferent modules in an expert pipeline called PuzzleGPT. PuzzleGPT consists of\na perceiver to identify visual clues, a reasoner to deduce prediction\ncandidates, a combiner to combinatorially combine information from different\nclues, a web retriever to get external knowledge if the task can't be solved\nlocally, and a noise filter for robustness. This results in a zero-shot,\ninterpretable, and robust approach that records state-of-the-art performance on\ntwo datasets -- TARA and WikiTilo. PuzzleGPT outperforms large VLMs such as\nBLIP-2, InstructBLIP, LLaVA, and even GPT-4V, as well as automatically\ngenerated reasoning pipelines like VisProg, by at least 32% and 38%,\nrespectively. It even rivals or surpasses finetuned models.\n","authors":["Hammad Ayyubi","Xuande Feng","Junzhang Liu","Xudong Lin","Zhecan Wang","Shih-Fu Chang"],"pdf_url":"https://arxiv.org/pdf/2501.14210v1.pdf","comment":"NAACL 2025 Findings"},{"id":"http://arxiv.org/abs/2501.14208v1","updated":"2025-01-24T03:26:41Z","published":"2025-01-24T03:26:41Z","title":"You Only Teach Once: Learn One-Shot Bimanual Robotic Manipulation from\n Video Demonstrations","summary":" Bimanual robotic manipulation is a long-standing challenge of embodied\nintelligence due to its characteristics of dual-arm spatial-temporal\ncoordination and high-dimensional action spaces. Previous studies rely on\npre-defined action taxonomies or direct teleoperation to alleviate or\ncircumvent these issues, often making them lack simplicity, versatility and\nscalability. Differently, we believe that the most effective and efficient way\nfor teaching bimanual manipulation is learning from human demonstrated videos,\nwhere rich features such as spatial-temporal positions, dynamic postures,\ninteraction states and dexterous transitions are available almost for free. 
In\nthis work, we propose the YOTO (You Only Teach Once), which can extract and\nthen inject patterns of bimanual actions from as few as a single binocular\nobservation of hand movements, and teach dual robot arms various complex tasks.\nFurthermore, based on keyframes-based motion trajectories, we devise a subtle\nsolution for rapidly generating training demonstrations with diverse variations\nof manipulated objects and their locations. These data can then be used to\nlearn a customized bimanual diffusion policy (BiDP) across diverse scenes. In\nexperiments, YOTO achieves impressive performance in mimicking 5 intricate\nlong-horizon bimanual tasks, possesses strong generalization under different\nvisual and spatial conditions, and outperforms existing visuomotor imitation\nlearning methods in accuracy and efficiency. Our project link is\nhttps://hnuzhy.github.io/projects/YOTO.\n","authors":["Huayi Zhou","Ruixiang Wang","Yunxin Tai","Yueci Deng","Guiliang Liu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2501.14208v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2310.19574v2","updated":"2025-01-24T03:23:47Z","published":"2023-10-30T14:30:27Z","title":"Skip-WaveNet: A Wavelet based Multi-scale Architecture to Trace Snow\n Layers in Radar Echograms","summary":" Airborne radar sensors capture the profile of snow layers present on top of\nan ice sheet. Accurate tracking of these layers is essential to calculate their\nthicknesses, which are required to investigate the contribution of polar ice\ncap melt to sea-level rise. However, automatically processing the radar\nechograms to detect the underlying snow layers is a challenging problem. In our\nwork, we develop wavelet-based multi-scale deep learning architectures for\nthese radar echograms to improve snow layer detection. These architectures\nestimate the layer depths with a mean absolute error of 3.31 pixels and 94.3%\naverage precision, achieving higher generalizability as compared to\nstate-of-the-art snow layer detection networks. These depth estimates also\nagree well with physically drilled stake measurements. Such robust\narchitectures can be used on echograms from future missions to efficiently\ntrace snow layers, estimate their individual thicknesses and thus support\nsea-level rise projection models.\n","authors":["Debvrat Varshney","Masoud Yari","Oluwanisola Ibikunle","Jilu Li","John Paden","Aryya Gangopadhyay","Maryam Rahnemoonfar"],"pdf_url":"https://arxiv.org/pdf/2310.19574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14204v1","updated":"2025-01-24T03:20:37Z","published":"2025-01-24T03:20:37Z","title":"Dynamic Token Reduction during Generation for Vision Language Models","summary":" Vision-Language Models (VLMs) have achieved notable success in multimodal\ntasks but face practical limitations due to the quadratic complexity of decoder\nattention mechanisms and autoregressive generation. Existing methods like FASTV\nand VTW have achieved notable results in reducing redundant visual tokens, but\nthese approaches focus on pruning tokens in a single forward pass without\nsystematically analyzing the redundancy of visual tokens throughout the entire\ngeneration process. In this paper, we introduce a dynamic pruning strategy\ntailored for VLMs, namedDynamic Rate (DyRate), which progressively adjusts the\ncompression rate during generation. 
Our analysis of the distribution of\nattention reveals that the importance of visual tokens decreases throughout the\ngeneration process, inspiring us to adopt a more aggressive compression rate.\nBy integrating a lightweight predictor based on attention distribution, our\napproach enables flexible adjustment of pruning rates based on the attention\ndistribution. Our experimental results demonstrate that our method not only\nreduces computational demands but also maintains the quality of responses.\n","authors":["Xiaoyu Liang","Chaofeng Guan","Jiaying Lu","Huiyao Chen","Huan Wang","Haoji Hu"],"pdf_url":"https://arxiv.org/pdf/2501.14204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14198v1","updated":"2025-01-24T03:04:44Z","published":"2025-01-24T03:04:44Z","title":"Sparse Mixture-of-Experts for Non-Uniform Noise Reduction in MRI Images","summary":" Magnetic Resonance Imaging (MRI) is an essential diagnostic tool in clinical\nsettings, but its utility is often hindered by noise artifacts introduced\nduring the imaging process.Effective denoising is critical for enhancing image\nquality while preserving anatomical structures. However, traditional denoising\nmethods, which often assume uniform noise distributions, struggle to handle the\nnon-uniform noise commonly present in MRI images. In this paper, we introduce a\nnovel approach leveraging a sparse mixture-of-experts framework for MRI image\ndenoising. Each expert is a specialized denoising convolutional neural network\nfine-tuned to target specific noise characteristics associated with different\nimage regions. Our method demonstrates superior performance over\nstate-of-the-art denoising techniques on both synthetic and real-world brain\nMRI datasets. Furthermore, we show that it generalizes effectively to unseen\ndatasets, highlighting its robustness and adaptability.\n","authors":["Zeyun Deng","Joseph Campbell"],"pdf_url":"https://arxiv.org/pdf/2501.14198v1.pdf","comment":"Accepted to the WACV Workshop on Image Quality"},{"id":"http://arxiv.org/abs/2501.14195v1","updated":"2025-01-24T02:57:09Z","published":"2025-01-24T02:57:09Z","title":"VideoShield: Regulating Diffusion-based Video Generation Models via\n Watermarking","summary":" Artificial Intelligence Generated Content (AIGC) has advanced significantly,\nparticularly with the development of video generation models such as\ntext-to-video (T2V) models and image-to-video (I2V) models. However, like other\nAIGC types, video generation requires robust content control. A common approach\nis to embed watermarks, but most research has focused on images, with limited\nattention given to videos. Traditional methods, which embed watermarks\nframe-by-frame in a post-processing manner, often degrade video quality. In\nthis paper, we propose VideoShield, a novel watermarking framework specifically\ndesigned for popular diffusion-based video generation models. Unlike\npost-processing methods, VideoShield embeds watermarks directly during video\ngeneration, eliminating the need for additional training. To ensure video\nintegrity, we introduce a tamper localization feature that can detect changes\nboth temporally (across frames) and spatially (within individual frames). Our\nmethod maps watermark bits to template bits, which are then used to generate\nwatermarked noise during the denoising process. Using DDIM Inversion, we can\nreverse the video to its original watermarked noise, enabling straightforward\nwatermark extraction. 
Additionally, template bits allow precise detection for\npotential temporal and spatial modification. Extensive experiments across\nvarious video models (both T2V and I2V models) demonstrate that our method\neffectively extracts watermarks and detects tamper without compromising video\nquality. Furthermore, we show that this approach is applicable to image\ngeneration models, enabling tamper detection in generated images as well. Codes\nand models are available at\n\\href{https://github.com/hurunyi/VideoShield}{https://github.com/hurunyi/VideoShield}.\n","authors":["Runyi Hu","Jie Zhang","Yiming Li","Jiwei Li","Qing Guo","Han Qiu","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.14195v1.pdf","comment":"International Conference on Learning Representations (ICLR) 2025"},{"id":"http://arxiv.org/abs/2501.14194v1","updated":"2025-01-24T02:56:59Z","published":"2025-01-24T02:56:59Z","title":"ENTER: Event Based Interpretable Reasoning for VideoQA","summary":" In this paper, we present ENTER, an interpretable Video Question Answering\n(VideoQA) system based on event graphs. Event graphs convert videos into\ngraphical representations, where video events form the nodes and event-event\nrelationships (temporal/causal/hierarchical) form the edges. This structured\nrepresentation offers many benefits: 1) Interpretable VideoQA via generated\ncode that parses event-graph; 2) Incorporation of contextual visual information\nin the reasoning process (code generation) via event graphs; 3) Robust VideoQA\nvia Hierarchical Iterative Update of the event graphs. Existing interpretable\nVideoQA systems are often top-down, disregarding low-level visual information\nin the reasoning plan generation, and are brittle. While bottom-up approaches\nproduce responses from visual data, they lack interpretability. Experimental\nresults on NExT-QA, IntentQA, and EgoSchema demonstrate that not only does our\nmethod outperform existing top-down approaches while obtaining competitive\nperformance against bottom-up approaches, but more importantly, offers superior\ninterpretability and explainability in the reasoning process.\n","authors":["Hammad Ayyubi","Junzhang Liu","Ali Asgarov","Zaber Ibn Abdul Hakim","Najibul Haque Sarker","Zhecan Wang","Chia-Wei Tang","Hani Alomari","Md. Atabuzzaman","Xudong Lin","Naveen Reddy Dyava","Shih-Fu Chang","Chris Thomas"],"pdf_url":"https://arxiv.org/pdf/2501.14194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14190v1","updated":"2025-01-24T02:53:59Z","published":"2025-01-24T02:53:59Z","title":"High-Precision Fabric Defect Detection via Adaptive Shape Convolutions\n and Large Kernel Spatial Modeling","summary":" Detecting fabric defects in the textile industry remains a challenging task\ndue to the diverse and complex nature of defect patterns. Traditional methods\noften suffer from slow inference speeds, limited accuracy, and inadequate\nrecognition rates, particularly in scenarios involving intricate or subtle\ndefects. 
To overcome these limitations, we introduce Fab-ASLKS, an advanced\nfabric defect detection framework built upon the YOLOv8s architecture.\nFab-ASLKS incorporates two key modules: (1) the Adaptive Shape Convolution\nModule (ASCM), which leverages adaptive shape convolution within the Neck to\nenhance feature fusion and improve efficiency by extending the capabilities of\nthe standard C2f structure, and (2) the Large Kernel Shift Convolution Module\n(LKSCM), designed to emulate large kernel effects within the Backbone, enabling\nsuperior spatial information extraction. These modules collaboratively optimize\nfeature extraction and information integration across the network. Extensive\nexperiments conducted on the Tianchi fabric defect detection dataset\ndemonstrate that Fab-ASLKS achieves a 5% improvement in mAP@50 over the\nbaseline, showcasing its capability to deliver high precision and efficiency.\n","authors":["Shuai Wang","Yang Xu","Hui Zheng","Baotian Li"],"pdf_url":"https://arxiv.org/pdf/2501.14190v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.05074v2","updated":"2025-01-24T02:39:31Z","published":"2024-12-06T14:32:25Z","title":"LoFi: Vision-Aided Label Generator for Wi-Fi Localization and Tracking","summary":" Data-driven Wi-Fi localization and tracking have shown great promise due to\ntheir lower reliance on specialized hardware compared to model-based methods.\nHowever, most existing data collection techniques provide only coarse-grained\nground truth or a limited number of labeled points, significantly hindering the\nadvancement of data-driven approaches. While systems like lidar can deliver\nprecise ground truth, their high costs make them inaccessible to many users. To\naddress these challenges, we propose LoFi, a vision-aided label generator for\nWi-Fi localization and tracking. LoFi can generate ground truth position\ncoordinates solely from 2D images, offering high precision, low cost, and ease\nof use. Utilizing our method, we have compiled a Wi-Fi tracking and\nlocalization dataset using the ESP32-S3 and a webcam, which will be\nopen-sourced along with the code upon publication.\n","authors":["Zijian Zhao","Tingwei Chen","Fanyi Meng","Zhijie Cai","Hang Li","Xiaoyang Li","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.05074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14182v1","updated":"2025-01-24T02:22:42Z","published":"2025-01-24T02:22:42Z","title":"Post-hoc Spurious Correlation Neutralization with Single-Weight\n Fictitious Class Unlearning","summary":" Neural network training tends to exploit the simplest features as shortcuts\nto greedily minimize training loss. However, some of these features might be\nspuriously correlated with the target labels, leading to incorrect predictions\nby the model. Several methods have been proposed to address this issue.\nFocusing on suppressing the spurious correlations with model training, they not\nonly incur additional training cost, but also have limited practical utility as\nthe model misbehavior due to spurious relations is usually discovered after its\ndeployment. It is also often overlooked that spuriousness is a subjective\nnotion. Hence, the precise questions that must be investigated are; to what\ndegree a feature is spurious, and how we can proportionally distract the\nmodel's attention from it for reliable prediction. To this end, we propose a\nmethod that enables post-hoc neutralization of spurious feature impact,\ncontrollable to an arbitrary degree. 
We conceptualize spurious features as\nfictitious sub-classes within the original classes, which can be eliminated by\na class removal scheme. We then propose a unique precise class removal\ntechnique that employs a single-weight modification, which entails negligible\nperformance compromise for the remaining classes. We perform extensive\nexperiments, demonstrating that by editing just a single weight in a post-hoc\nmanner, our method achieves highly competitive, or better performance against\nthe state-of-the-art methods.\n","authors":["Shahin Hakemi","Naveed Akhtar","Ghulam Mubashar Hassan","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2501.14182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14318v4","updated":"2025-01-24T01:53:58Z","published":"2024-05-23T08:43:09Z","title":"Adaptive Retention & Correction: Test-Time Training for Continual\n Learning","summary":" Continual learning, also known as lifelong learning or incremental learning,\nrefers to the process by which a model learns from a stream of incoming data\nover time. A common problem in continual learning is the classification layer's\nbias towards the most recent task. Traditionally, methods have relied on\nincorporating data from past tasks during training to mitigate this issue.\nHowever, the recent shift in continual learning to memory-free environments has\nrendered these approaches infeasible. In this study, we propose a solution\nfocused on the testing phase. We first introduce a simple Out-of-Task Detection\nmethod, OTD, designed to accurately identify samples from past tasks during\ntesting. Leveraging OTD, we then propose: (1) an Adaptive Retention mechanism\nfor dynamically tuning the classifier layer on past task data; (2) an Adaptive\nCorrection mechanism for revising predictions when the model classifies data\nfrom previous tasks into classes from the current task. We name our approach\nAdaptive Retention & Correction (ARC). While designed for memory-free\nenvironments, ARC also proves effective in memory-based settings. Extensive\nexperiments show that our proposed method can be plugged in to virtually any\nexisting continual learning approach without requiring any modifications to its\ntraining procedure. Specifically, when integrated with state-of-the-art\napproaches, ARC achieves an average performance increase of 2.7% and 2.6% on\nthe CIFAR-100 and Imagenet-R datasets, respectively.\n","authors":["Haoran Chen","Micah Goldblum","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.14318v4.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2501.14174v1","updated":"2025-01-24T01:50:19Z","published":"2025-01-24T01:50:19Z","title":"Dreamweaver: Learning Compositional World Representations from Pixels","summary":" Humans have an innate ability to decompose their perceptions of the world\ninto objects and their attributes, such as colors, shapes, and movement\npatterns. This cognitive process enables us to imagine novel futures by\nrecombining familiar concepts. However, replicating this ability in artificial\nintelligence systems has proven challenging, particularly when it comes to\nmodeling videos into compositional concepts and generating unseen, recomposed\nfutures without relying on auxiliary data, such as text, masks, or bounding\nboxes. In this paper, we propose Dreamweaver, a neural architecture designed to\ndiscover hierarchical and compositional representations from raw videos and\ngenerate compositional future simulations. 
Our approach leverages a novel\nRecurrent Block-Slot Unit (RBSU) to decompose videos into their constituent\nobjects and attributes. In addition, Dreamweaver uses a multi-future-frame\nprediction objective to capture disentangled representations for dynamic\nconcepts more effectively as well as static concepts. In experiments, we\ndemonstrate our model outperforms current state-of-the-art baselines for world\nmodeling when evaluated under the DCI framework across multiple datasets.\nFurthermore, we show how the modularized concept representations of our model\nenable compositional imagination, allowing the generation of novel videos by\nrecombining attributes from different objects.\n","authors":["Junyeob Baek","Yi-Fu Wu","Gautam Singh","Sungjin Ahn"],"pdf_url":"https://arxiv.org/pdf/2501.14174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14172v1","updated":"2025-01-24T01:44:48Z","published":"2025-01-24T01:44:48Z","title":"UltraLightSqueezeNet: A Deep Learning Architecture for Malaria\n Classification with up to 54x fewer trainable parameters for resource\n constrained devices","summary":" Lightweight deep learning approaches for malaria detection have gained\nattention for their potential to enhance diagnostics in resource constrained\nenvironments. For our study, we selected SqueezeNet1.1 as it is one of the most\npopular lightweight architectures. SqueezeNet1.1 is a later version of\nSqueezeNet1.0 and is 2.4 times more computationally efficient than the original\nmodel. We proposed and implemented three ultra-lightweight architecture\nvariants to SqueezeNet1.1 architecture, namely Variant 1 (one fire module),\nVariant 2 (two fire modules), and Variant 3 (four fire modules), which are even\nmore compact than SqueezeNetV1.1 (eight fire modules). These models were\nimplemented to evaluate the best performing variant that achieves superior\ncomputational efficiency without sacrificing accuracy in malaria blood cell\nclassification. The models were trained and evaluated using the NIH Malaria\ndataset. We assessed each model's performance based on metrics including\naccuracy, recall, precision, F1-score, and Area Under the Curve (AUC). The\nresults show that the SqueezeNet1.1 model achieves the highest performance\nacross all metrics, with a classification accuracy of 97.12%. Variant 3 (four\nfire modules) offers a competitive alternative, delivering almost identical\nresults (accuracy 96.55%) with a 6x reduction in computational overhead\ncompared to SqueezeNet1.1. Variant 2 and Variant 1 perform slightly lower than\nVariant 3, with Variant 2 (two fire modules) reducing computational overhead by\n28x, and Variant 1 (one fire module) achieving a 54x reduction in trainable\nparameters compared to SqueezeNet1.1. These findings demonstrate that our\nSqueezeNet1.1 architecture variants provide a flexible approach to malaria\ndetection, enabling the selection of a variant that balances resource\nconstraints and performance.\n","authors":["Suresh Babu Nettur","Shanthi Karpurapu","Unnati Nettur","Likhit Sagar Gajja","Sravanthy Myneni","Akhil Dusi","Lalithya Posham"],"pdf_url":"https://arxiv.org/pdf/2501.14172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14171v1","updated":"2025-01-24T01:40:16Z","published":"2025-01-24T01:40:16Z","title":"Fully Guided Neural Schrödinger bridge for Brain MR image synthesis","summary":" Multi-modal brain MRI provides essential complementary information for\nclinical diagnosis. 
However, acquiring all modalities is often challenging due\nto time and cost constraints. To address this, various methods have been\nproposed to generate missing modalities from available ones. Traditional\napproaches can be broadly categorized into two main types: paired and unpaired\nmethods. While paired methods offer superior performance, obtaining large-scale\npaired datasets is challenging in real-world scenarios. Conversely, unpaired\nmethods facilitate large-scale data collection but struggle to preserve\ncritical image features, such as tumors. In this paper, we propose Fully Guided\nSchr\\\"odinger Bridges (FGSB), a novel framework based on Neural Schr\\\"odinger\nBridges, to overcome these limitations. FGSB achieves stable, high-quality\ngeneration of missing modalities using minimal paired data. Furthermore, when\nprovided with ground truth or a segmentation network for specific regions, FGSB\ncan generate missing modalities while preserving these critical areas with\nreduced data requirements. Our proposed model consists of two consecutive\nphases. 1) Generation Phase: Fuses a generated image, a paired reference image,\nand Gaussian noise, employing iterative refinement to mitigate issues such as\nmode collapse and improve generation quality 2) Training Phase: Learns the\nmapping from the generated image to the target modality. Experiments\ndemonstrate that FGSB achieves comparable generation performance to methods\ntrained on large datasets, while using data from only two subjects. Moreover,\nthe utilization of lesion information with FGSB significantly enhances its\nability to preserve crucial lesion features.\n","authors":["Hanyeol Yang","Sunggyu Kim","Yongseon Yoo","Jong-min Lee"],"pdf_url":"https://arxiv.org/pdf/2501.14171v1.pdf","comment":"9 pages,4 figures"},{"id":"http://arxiv.org/abs/2501.14166v1","updated":"2025-01-24T01:35:10Z","published":"2025-01-24T01:35:10Z","title":"Enhancing Multimodal Entity Linking with Jaccard Distance-based\n Conditional Contrastive Learning and Contextual Visual Augmentation","summary":" Previous research on multimodal entity linking (MEL) has primarily employed\ncontrastive learning as the primary objective. However, using the rest of the\nbatch as negative samples without careful consideration, these studies risk\nleveraging easy features and potentially overlook essential details that make\nentities unique. In this work, we propose JD-CCL (Jaccard Distance-based\nConditional Contrastive Learning), a novel approach designed to enhance the\nability to match multimodal entity linking models. JD-CCL leverages\nmeta-information to select negative samples with similar attributes, making the\nlinking task more challenging and robust. Additionally, to address the\nlimitations caused by the variations within the visual modality among mentions\nand entities, we introduce a novel method, CVaCPT (Contextual Visual-aid\nControllable Patch Transform). It enhances visual representations by\nincorporating multi-view synthetic images and contextual textual\nrepresentations to scale and shift patch representations. 
Experimental results\non benchmark MEL datasets demonstrate the strong effectiveness of our approach.\n","authors":["Cong-Duy Nguyen","Xiaobao Wu","Thong Nguyen","Shuai Zhao","Khoi Le","Viet-Anh Nguyen","Feng Yichao","Anh Tuan Luu"],"pdf_url":"https://arxiv.org/pdf/2501.14166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14158v1","updated":"2025-01-24T01:07:58Z","published":"2025-01-24T01:07:58Z","title":"Advancing MRI Reconstruction: A Systematic Review of Deep Learning and\n Compressed Sensing Integration","summary":" Magnetic resonance imaging (MRI) is a non-invasive imaging modality and\nprovides comprehensive anatomical and functional insights into the human body.\nHowever, its long acquisition times can lead to patient discomfort, motion\nartifacts, and limiting real-time applications. To address these challenges,\nstrategies such as parallel imaging have been applied, which utilize multiple\nreceiver coils to speed up the data acquisition process. Additionally,\ncompressed sensing (CS) is a method that facilitates image reconstruction from\nsparse data, significantly reducing image acquisition time by minimizing the\namount of data collection needed. Recently, deep learning (DL) has emerged as a\npowerful tool for improving MRI reconstruction. It has been integrated with\nparallel imaging and CS principles to achieve faster and more accurate MRI\nreconstructions. This review comprehensively examines DL-based techniques for\nMRI reconstruction. We categorize and discuss various DL-based methods,\nincluding end-to-end approaches, unrolled optimization, and federated learning,\nhighlighting their potential benefits. Our systematic review highlights\nsignificant contributions and underscores the potential of DL in MRI\nreconstruction. Additionally, we summarize key results and trends in DL-based\nMRI reconstruction, including quantitative metrics, the dataset, acceleration\nfactors, and the progress of and research interest in DL techniques over time.\nFinally, we discuss potential future directions and the importance of DL-based\nMRI reconstruction in advancing medical imaging. To facilitate further research\nin this area, we provide a GitHub repository that includes up-to-date DL-based\nMRI reconstruction publications and public\ndatasets-https://github.com/mosaf/Awesome-DL-based-CS-MRI.\n","authors":["Mojtaba Safari","Zach Eidex","Chih-Wei Chang","Richard L. J. Qiu","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2501.14158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14149v1","updated":"2025-01-24T00:33:21Z","published":"2025-01-24T00:33:21Z","title":"Effective Defect Detection Using Instance Segmentation for NDI","summary":" Ultrasonic testing is a common Non-Destructive Inspection (NDI) method used\nin aerospace manufacturing. However, the complexity and size of the ultrasonic\nscans make it challenging to identify defects through visual inspection or\nmachine learning models. Using computer vision techniques to identify defects\nfrom ultrasonic scans is an evolving research area. In this study, we used\ninstance segmentation to identify the presence of defects in the ultrasonic\nscan images of composite panels that are representative of real components\nmanufactured in aerospace. We used two models based on Mask-RCNN (Detectron 2)\nand YOLO 11 respectively. Additionally, we implemented a simple statistical\npre-processing technique that reduces the burden of requiring custom-tailored\npre-processing techniques. 
Our study demonstrates the feasibility and\neffectiveness of using instance segmentation in the NDI pipeline by\nsignificantly reducing data pre-processing time, inspection time, and overall\ncosts.\n","authors":["Ashiqur Rahman","Venkata Devesh Reddy Seethi","Austin Yunker","Zachary Kral","Rajkumar Kettimuthu","Hamed Alhoori"],"pdf_url":"https://arxiv.org/pdf/2501.14149v1.pdf","comment":"6 pages, 2 figures, 2 tables. Published at AI2ASE 2025 workshop at\n AAAI2025. Accepted publication is available at https://ai-2-ase.github.io/"},{"id":"http://arxiv.org/abs/2501.14148v1","updated":"2025-01-24T00:31:01Z","published":"2025-01-24T00:31:01Z","title":"SelfPrompt: Confidence-Aware Semi-Supervised Tuning for Robust\n Vision-Language Model Adaptation","summary":" We present SelfPrompt, a novel prompt-tuning approach for vision-language\nmodels (VLMs) in a semi-supervised learning setup. Existing methods for tuning\nVLMs in semi-supervised setups struggle with the negative impact of the\nmiscalibrated VLMs on pseudo-labelling, and the accumulation of noisy\npseudo-labels. SelfPrompt addresses these challenges by introducing a\ncluster-guided pseudo-labelling method that improves pseudo-label accuracy, and\na confidence-aware semi-supervised learning module that maximizes the\nutilization of unlabelled data by combining supervised learning and\nweakly-supervised learning. Additionally, we investigate our method in an\nactive semi-supervised learning setup, where the labelled set is strategically\nselected to ensure the best utilization of a limited labelling budget. To this\nend, we propose a weakly-supervised sampling technique that selects a diverse\nand representative labelled set, which can be seamlessly integrated into\nexisting methods to enhance their performance. We conduct extensive evaluations\nacross 13 datasets, significantly surpassing state-of-the-art performances with\naverage improvements of 6.23% in standard semi-supervised learning, 6.25% in\nactive semi-supervised learning, and 4.9% in base-to-novel generalization,\nusing a 2-shot setup. Furthermore, SelfPrompt shows excellent generalization in\nsingle-shot settings, achieving an average improvement of 11.78%.\n","authors":["Shuvendu Roy","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2501.14148v1.pdf","comment":null}]},"2025-01-27T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2412.05313v4","updated":"2025-01-27T18:53:40Z","published":"2024-11-28T19:31:50Z","title":"λ: A Benchmark for Data-Efficiency in Long-Horizon Indoor Mobile\n Manipulation Robotics","summary":" Efficiently learning and executing long-horizon mobile manipulation (MoMa)\ntasks is crucial for advancing robotics in household and workplace settings.\nHowever, current MoMa models are data-inefficient, underscoring the need for\nimproved models that require realistic-sized benchmarks to evaluate their\nefficiency, which do not exist. To address this, we introduce the LAMBDA\n({\\lambda}) benchmark (Long-horizon Actions for Mobile-manipulation\nBenchmarking of Directed Activities), which evaluates the data efficiency of\nmodels on language-conditioned, long-horizon, multi-room, multi-floor,\npick-and-place tasks using a dataset of manageable size, more feasible for\ncollection. The benchmark includes 571 human-collected demonstrations that\nprovide realism and diversity in simulated and real-world settings. Unlike\nplanner-generated data, these trajectories offer natural variability and\nreplay-verifiability, ensuring robust learning and evaluation. 
We benchmark\nseveral models, including learning-based models and a neuro-symbolic modular\napproach combining foundation models with task and motion planning.\nLearning-based models show suboptimal success rates, even when leveraging\npretrained weights, underscoring significant data inefficiencies. However, the\nneuro-symbolic approach performs significantly better while being more data\nefficient. Findings highlight the need for more data-efficient learning-based\nMoMa approaches. {\\lambda} addresses this gap by serving as a key benchmark for\nevaluating the data efficiency of those future models in handling household\nrobotics tasks.\n","authors":["Ahmed Jaafar","Shreyas Sundara Raman","Yichen Wei","Sudarshan Harithas","Sofia Juliani","Anneke Wernerfelt","Benedict Quartey","Ifrah Idrees","Jason Xinyu Liu","Stefanie Tellex"],"pdf_url":"https://arxiv.org/pdf/2412.05313v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13548v4","updated":"2025-01-27T17:04:37Z","published":"2024-12-18T06:49:46Z","title":"TelePreview: A User-Friendly Teleoperation System with Virtual Arm\n Assistance for Enhanced Effectiveness","summary":" Teleoperation provides an effective way to collect robot data, which is\ncrucial for learning from demonstrations. In this field, teleoperation faces\nseveral key challenges: user-friendliness for new users, safety assurance, and\ntransferability across different platforms. While collecting real robot\ndexterous manipulation data by teleoperation to train robots has shown\nimpressive results on diverse tasks, due to the morphological differences\nbetween human and robot hands, it is not only hard for new users to understand\nthe action mapping but also raises potential safety concerns during operation.\nTo address these limitations, we introduce TelePreview. This teleoperation\nsystem offers real-time visual feedback on robot actions based on human user\ninputs, with a total hardware cost of less than $1,000. TelePreview allows the\nuser to see a virtual robot that represents the outcome of the user's next\nmovement. By enabling flexible switching between command visualization and\nactual execution, this system helps new users learn how to demonstrate quickly\nand safely. We demonstrate that it outperforms other teleoperation systems\nacross five tasks, emphasize its ease of use, and highlight its straightforward\ndeployment across diverse robotic platforms. We release our code and a\ndeployment document on our website https://nus-lins-lab.github.io/telepreview/.\n","authors":["Jingxiang Guo","Jiayu Luo","Zhenyu Wei","Yiwen Hou","Zhixuan Xu","Xiaoyi Lin","Chongkai Gao","Lin Shao"],"pdf_url":"https://arxiv.org/pdf/2412.13548v4.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2501.16212v1","updated":"2025-01-27T17:04:12Z","published":"2025-01-27T17:04:12Z","title":"An FPGA-Based Neuro-Fuzzy Sensor for Personalized Driving Assistance","summary":" Advanced driving-assistance systems (ADAS) are intended to automatize driver\ntasks, as well as improve driving and vehicle safety. This work proposes an\nintelligent neuro-fuzzy sensor for driving style (DS) recognition, suitable for\nADAS enhancement. The development of the driving style intelligent sensor uses\nnaturalistic driving data from the SHRP2 study, which includes data from a CAN\nbus, inertial measurement unit, and front radar. The system has been\nsuccessfully implemented using a field-programmable gate array (FPGA) device of\nthe Xilinx Zynq programmable system-on-chip (PSoC). 
It can mimic the typical\ntiming parameters of a group of drivers as well as tune these typical\nparameters to model individual DSs. The neuro-fuzzy intelligent sensor provides\nhigh-speed real-time active ADAS implementation and is able to personalize its\nbehavior into safe margins without driver intervention. In particular, the\npersonalization procedure of the time headway (THW) parameter for an ACC in\nsteady car following was developed, achieving a performance of 0.53\nmicroseconds. This performance fulfilled the requirements of cutting-edge\nactive ADAS specifications.\n","authors":["Óscar Mata-Carballeira","Jon Gutiérrez-Zaballa","Inés del Campo","Victoria Martínez"],"pdf_url":"https://arxiv.org/pdf/2501.16212v1.pdf","comment":"Journal Article"},{"id":"http://arxiv.org/abs/2501.14486v2","updated":"2025-01-27T15:26:49Z","published":"2025-01-24T13:40:33Z","title":"Visual-Lidar Map Alignment for Infrastructure Inspections","summary":" Routine and repetitive infrastructure inspections present safety, efficiency,\nand consistency challenges as they are performed manually, often in challenging\nor hazardous environments. They can also introduce subjectivity and errors into\nthe process, resulting in undesirable outcomes. Simultaneous localization and\nmapping (SLAM) presents an opportunity to generate high-quality 3D maps that\ncan be used to extract accurate and objective inspection data. Yet, many SLAM\nalgorithms are limited in their ability to align 3D maps from repeated\ninspections in GPS-denied settings automatically. This limitation hinders\npractical long-term asset health assessments by requiring tedious manual\nalignment for data association across scans from previous inspections. This\npaper introduces a versatile map alignment algorithm leveraging both visual and\nlidar data for improved place recognition robustness and presents an\ninfrastructure-focused dataset tailored for consecutive inspections. By\ndetaching map alignment from SLAM, our approach enhances infrastructure\ninspection pipelines, supports monitoring asset degradation over time, and\ninvigorates SLAM research by permitting exploration beyond existing\nmulti-session SLAM algorithms.\n","authors":["Jake McLaughlin","Nicholas Charron","Sriram Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2501.14486v2.pdf","comment":"8 pages, 8 figures, for associated code see\n https://github.com/jakemclaughlin6/vlma"},{"id":"http://arxiv.org/abs/2501.16101v1","updated":"2025-01-27T14:50:19Z","published":"2025-01-27T14:50:19Z","title":"3D Reconstruction of non-visible surfaces of objects from a Single Depth\n View -- Comparative Study","summary":" Scene and object reconstruction is an important problem in robotics, in\nparticular in planning collision-free trajectories or in object manipulation.\nThis paper compares two strategies for the reconstruction of nonvisible parts\nof the object surface from a single RGB-D camera view. The first method, named\nDeepSDF predicts the Signed Distance Transform to the object surface for a\ngiven point in 3D space. The second method, named MirrorNet reconstructs the\noccluded objects' parts by generating images from the other side of the\nobserved object. 
Experiments performed with objects from the ShapeNet dataset,\nshow that the view-dependent MirrorNet is faster and has smaller reconstruction\nerrors in most categories.\n","authors":["Rafał Staszak","Piotr Michałek","Jakub Chudziński","Marek Kopicki","Dominik Belter"],"pdf_url":"https://arxiv.org/pdf/2501.16101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09372v2","updated":"2025-01-27T13:30:20Z","published":"2024-07-12T15:53:15Z","title":"Segmentation Dataset for Reinforced Concrete Construction","summary":" This paper provides a dataset of 14,805 RGB images with segmentation labels\nfor autonomous robotic inspection of reinforced concrete defects. Baselines for\nthe YOLOv8L-seg, DeepLabV3, and U-Net segmentation models are established.\nLabelling inconsistencies are addressed statistically, and their influence on\nmodel performance is analyzed. An error identification tool is employed to\nexamine the error modes of the models. The paper demonstrates that YOLOv8L-seg\nperforms best, achieving a validation mIOU score of up to 0.59. Label\ninconsistencies were found to have a negligible effect on model performance,\nwhile the inclusion of more data improved the performance. False negatives were\nidentified as the primary failure mode. The results highlight the importance of\ndata availability for the performance of deep learning-based models. The lack\nof publicly available data is identified as a significant contributor to false\nnegatives. To address this, the paper advocates for an increased open-source\napproach within the construction community.\n","authors":["Patrick Schmidt","Lazaros Nalpantidis"],"pdf_url":"https://arxiv.org/pdf/2407.09372v2.pdf","comment":"The ConRebSeg Dataset can be found under the following DOI:\n https://doi.org/10.11583/DTU.26213762 Corresponding code to download\n additional data and initialize the dataset under\n https://github.com/DTU-PAS/ConRebSeg This work is an accepted manuscript up\n for publication in the Elsevier journal \"Automation in Construction\""},{"id":"http://arxiv.org/abs/2501.16006v1","updated":"2025-01-27T12:44:19Z","published":"2025-01-27T12:44:19Z","title":"Underactuated dexterous robotic grasping with reconfigurable passive\n joints","summary":" We introduce a novel reconfigurable passive joint (RP-joint), which has been\nimplemented and tested on an underactuated three-finger robotic gripper.\nRP-joint has no actuation, but instead it is lightweight and compact. It can be\neasily reconfigured by applying external forces and locked to perform complex\ndexterous manipulation tasks, but only after tension is applied to the\nconnected tendon. Additionally, we present an approach that allows learning\ndexterous grasps from single examples with underactuated grippers and\nautomatically configures the RP-joints for dexterous manipulation. This is\nenhanced by integrating kinaesthetic contact optimization, which improves grasp\nperformance even further. 
The proposed RP-joint gripper and grasp planner have\nbeen tested on over 370 grasps executed on 42 IKEA objects and on the YCB\nobject dataset, achieving grasping success rates of 80% and 87%, on IKEA and\nYCB, respectively.\n","authors":["Marek Kopicki","Sainul Islam Ansary","Simone Tolomei","Franco Angelini","Manolo Garabini","Piotr Skrzypczyński"],"pdf_url":"https://arxiv.org/pdf/2501.16006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15901v1","updated":"2025-01-27T09:51:48Z","published":"2025-01-27T09:51:48Z","title":"Robust Mobile Robot Path Planning via LLM-Based Dynamic Waypoint\n Generation","summary":" Mobile robot path planning in complex environments remains a significant\nchallenge, especially in achieving efficient, safe and robust paths. The\ntraditional path planning techniques like DRL models typically trained for a\ngiven configuration of the starting point and target positions, these models\nonly perform well when these conditions are satisfied. In this paper, we\nproposed a novel path planning framework that embeds Large Language Models to\nempower mobile robots with the capability of dynamically interpreting natural\nlanguage commands and autonomously generating efficient, collision-free\nnavigation paths. The proposed framework uses LLMs to translate high-level user\ninputs into actionable waypoints while dynamically adjusting paths in response\nto obstacles. We experimentally evaluated our proposed LLM-based approach\nacross three different environments of progressive complexity, showing the\nrobustness of our approach with llama3.1 model that outperformed other LLM\nmodels in path planning time, waypoint generation success rate, and collision\navoidance. This underlines the promising contribution of LLMs for enhancing the\ncapability of mobile robots, especially when their operation involves complex\ndecisions in large and complex environments. Our framework has provided safer,\nmore reliable navigation systems and opened a new direction for the future\nresearch. The source code of this work is publicly available on GitHub.\n","authors":["Muhammad Taha Tariq","Congqing Wang","Yasir Hussain"],"pdf_url":"https://arxiv.org/pdf/2501.15901v1.pdf","comment":"18 pages, 6 figures, submitted in Journal Expert Systems with\n Applications"},{"id":"http://arxiv.org/abs/2411.12308v2","updated":"2025-01-27T09:51:05Z","published":"2024-11-19T07:49:22Z","title":"SNN-Based Online Learning of Concepts and Action Laws in an Open World","summary":" We present the architecture of a fully autonomous, bio-inspired cognitive\nagent built around a spiking neural network (SNN) implementing the agent's\nsemantic memory. The agent explores its universe and learns concepts of\nobjects/situations and of its own actions in a one-shot manner. While\nobject/situation concepts are unary, action concepts are triples made up of an\ninitial situation, a motor activity, and an outcome. They embody the agent's\nknowledge of its universe's actions laws. Both kinds of concepts have different\ndegrees of generality. To make decisions the agent queries its semantic memory\nfor the expected outcomes of envisaged actions and chooses the action to take\non the basis of these predictions. 
Our experiments show that the agent handles\nnew situations by appealing to previously learned general concepts and rapidly\nmodifies its concepts to adapt to environment changes.\n","authors":["Christel Grimaud","Dominique Longin","Andreas Herzig"],"pdf_url":"https://arxiv.org/pdf/2411.12308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15850v1","updated":"2025-01-27T08:18:52Z","published":"2025-01-27T08:18:52Z","title":"LLM-attacker: Enhancing Closed-loop Adversarial Scenario Generation for\n Autonomous Driving with Large Language Models","summary":" Ensuring and improving the safety of autonomous driving systems (ADS) is\ncrucial for the deployment of highly automated vehicles, especially in\nsafety-critical events. To address the rarity issue, adversarial scenario\ngeneration methods are developed, in which behaviors of traffic participants\nare manipulated to induce safety-critical events. However, existing methods\nstill face two limitations. First, identification of the adversarial\nparticipant directly impacts the effectiveness of the generation. However, the\ncomplexity of real-world scenarios, with numerous participants and diverse\nbehaviors, makes identification challenging. Second, the potential of generated\nsafety-critical scenarios to continuously improve ADS performance remains\nunderexplored. To address these issues, we propose LLM-attacker: a closed-loop\nadversarial scenario generation framework leveraging large language models\n(LLMs). Specifically, multiple LLM agents are designed and coordinated to\nidentify optimal attackers. Then, the trajectories of the attackers are\noptimized to generate adversarial scenarios. These scenarios are iteratively\nrefined based on the performance of ADS, forming a feedback loop to improve\nADS. Experimental results show that LLM-attacker can create more dangerous\nscenarios than other methods, and the ADS trained with it achieves a collision\nrate half that of training with normal scenarios. This indicates the ability of\nLLM-attacker to test and enhance the safety and robustness of ADS. Video\ndemonstrations are provided at:\nhttps://drive.google.com/file/d/1Zv4V3iG7825oyiKbUwS2Y-rR0DQIE1ZA/view.\n","authors":["Yuewen Mei","Tong Nie","Jian Sun","Ye Tian"],"pdf_url":"https://arxiv.org/pdf/2501.15850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15830v1","updated":"2025-01-27T07:34:33Z","published":"2025-01-27T07:34:33Z","title":"SpatialVLA: Exploring Spatial Representations for Visual-Language-Action\n Model","summary":" In this paper, we claim that spatial understanding is the keypoint in robot\nmanipulation, and propose SpatialVLA to explore effective spatial\nrepresentations for the robot foundation model. Specifically, we introduce\nEgo3D Position Encoding to inject 3D information into the input observations of\nthe visual-language-action model, and propose Adaptive Action Grids to\nrepresent spatial robot movement actions with adaptive discretized action\ngrids, facilitating learning generalizable and transferrable spatial action\nknowledge for cross-robot control. SpatialVLA is first pre-trained on top of a\nvision-language model with 1.1 Million real-world robot episodes, to learn a\ngeneralist manipulation policy across multiple robot environments and tasks.\nAfter pre-training, SpatialVLA is directly applied to perform numerous tasks in\na zero-shot manner. 
The superior results in both simulation and real-world\nrobots demonstrate its advantage of inferring complex robot motion trajectories\nand its strong in-domain multi-task generalization ability. We further show the\nproposed Adaptive Action Grids offer a new and effective way to fine-tune the\npre-trained SpatialVLA model for new simulation and real-world setups, where\nthe pre-learned action grids are re-discretized to capture robot-specific\nspatial action movements of new setups. The superior results from extensive\nevaluations demonstrate the exceptional in-distribution generalization and\nout-of-distribution adaptation capability, highlighting the crucial benefit of\nthe proposed spatial-aware representations for generalist robot policy\nlearning. All the details and codes will be open-sourced.\n","authors":["Delin Qu","Haoming Song","Qizhi Chen","Yuanqi Yao","Xinyi Ye","Yan Ding","Zhigang Wang","JiaYuan Gu","Bin Zhao","Dong Wang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2501.15830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15806v1","updated":"2025-01-27T06:28:29Z","published":"2025-01-27T06:28:29Z","title":"Autonomous Horizon-based Asteroid Navigation With\n Observability-constrained Maneuvers","summary":" Asteroid exploration is a pertinent challenge due to the varying complexity\nof their dynamical environments, shape and communication delays due to\ndistance. Thus, autonomous navigation methods are continually being developed\nand improved in current research to enable their safe exploration. These\nmethods often involve using horizon-based Optical Navigation (OpNav) to\ndetermine the spacecraft's location, which is reliant on the visibility of the\nhorizon. It is critical to ensure the reliability of this measurement such that\nthe spacecraft may maintain an accurate state estimate throughout its mission.\nThis paper presents an algorithm that generates control maneuvers for\nspacecraft to follow trajectories that allow continuously usable optical\nmeasurements to maintain system observability for safe navigation. This\nalgorithm improves upon existing asteroid navigation capabilities by allowing\nthe safe and robust autonomous targeting of various trajectories and orbits at\na wide range of distances within optical measurement range. It is adaptable to\ndifferent asteroid scenarios. Overall, the approach develops an\nall-encompassing system that simulates the asteroid dynamics, synthetic image\ngeneration, edge detection, horizon-based OpNav, filtering and\nobservability-enhancing control.\n","authors":["Aditya Arjun Anibha","Kenshiro Oguri"],"pdf_url":"https://arxiv.org/pdf/2501.15806v1.pdf","comment":"38 pages, 16 figures, preprint under journal review"},{"id":"http://arxiv.org/abs/2211.15136v4","updated":"2025-01-27T06:12:13Z","published":"2022-11-28T08:48:58Z","title":"Collective Intelligence for 2D Push Manipulations with Mobile Robots","summary":" While natural systems often present collective intelligence that allows them\nto self-organize and adapt to changes, the equivalent is missing in most\nartificial systems. We explore the possibility of such a system in the context\nof cooperative 2D push manipulations using mobile robots. Although conventional\nworks demonstrate potential solutions for the problem in restricted settings,\nthey have computational and learning difficulties. 
More importantly, these\nsystems do not possess the ability to adapt when facing environmental changes.\nIn this work, we show that by distilling a planner derived from a\ndifferentiable soft-body physics simulator into an attention-based neural\nnetwork, our multi-robot push manipulation system achieves better performance\nthan baselines. In addition, our system also generalizes to configurations not\nseen during training and is able to adapt toward task completions when external\nturbulence and environmental changes are applied. Supplementary videos can be\nfound on our project website: https://sites.google.com/view/ciom/home\n","authors":["So Kuroki","Tatsuya Matsushima","Jumpei Arima","Hiroki Furuta","Yutaka Matsuo","Shixiang Shane Gu","Yujin Tang"],"pdf_url":"https://arxiv.org/pdf/2211.15136v4.pdf","comment":"Published in IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2306.09872v3","updated":"2025-01-27T06:11:58Z","published":"2023-06-14T03:37:55Z","title":"GenORM: Generalizable One-shot Rope Manipulation with Parameter-Aware\n Policy","summary":" Due to the inherent uncertainty in their deformability during motion,\nprevious methods in rope manipulation often require hundreds of real-world\ndemonstrations to train a manipulation policy for each rope, even for simple\ntasks such as rope goal reaching, which hinder their applications in our\never-changing world. To address this issue, we introduce GenORM, a framework\nthat allows the manipulation policy to handle different deformable ropes with a\nsingle real-world demonstration. To achieve this, we augment the policy by\nconditioning it on deformable rope parameters and training it with a diverse\nrange of simulated deformable ropes so that the policy can adjust actions based\non different rope parameters. At the time of inference, given a new rope,\nGenORM estimates the deformable rope parameters by minimizing the disparity\nbetween the grid density of point clouds of real-world demonstrations and\nsimulations. With the help of a differentiable physics simulator, we require\nonly a single real-world demonstration. Empirical validations on both simulated\nand real-world rope manipulation setups clearly show that our method can\nmanipulate different ropes with a single demonstration and significantly\noutperforms the baseline in both environments (62% improvement in in-domain\nropes, and 15% improvement in out-of-distribution ropes in simulation, 26%\nimprovement in real-world), demonstrating the effectiveness of our approach in\none-shot rope manipulation.\n","authors":["So Kuroki","Jiaxian Guo","Tatsuya Matsushima","Takuya Okubo","Masato Kobayashi","Yuya Ikeda","Ryosuke Takanami","Paul Yoo","Yutaka Matsuo","Yusuke Iwasawa"],"pdf_url":"https://arxiv.org/pdf/2306.09872v3.pdf","comment":"The extended version of this paper, GenDOM, was published in the 2024\n IEEE International Conference on Robotics and Automation (ICRA 2024),\n arXiv:2309.09051"},{"id":"http://arxiv.org/abs/2309.09051v4","updated":"2025-01-27T06:08:19Z","published":"2023-09-16T17:18:23Z","title":"GenDOM: Generalizable One-shot Deformable Object Manipulation with\n Parameter-Aware Policy","summary":" Due to the inherent uncertainty in their deformability during motion,\nprevious methods in deformable object manipulation, such as rope and cloth,\noften required hundreds of real-world demonstrations to train a manipulation\npolicy for each object, which hinders their applications in our ever-changing\nworld. 
To address this issue, we introduce GenDOM, a framework that allows the\nmanipulation policy to handle different deformable objects with only a single\nreal-world demonstration. To achieve this, we augment the policy by\nconditioning it on deformable object parameters and training it with a diverse\nrange of simulated deformable objects so that the policy can adjust actions\nbased on different object parameters. At the time of inference, given a new\nobject, GenDOM can estimate the deformable object parameters with only a single\nreal-world demonstration by minimizing the disparity between the grid density\nof point clouds of real-world demonstrations and simulations in a\ndifferentiable physics simulator. Empirical validations on both simulated and\nreal-world object manipulation setups clearly show that our method can\nmanipulate different objects with a single demonstration and significantly\noutperforms the baseline in both environments (a 62% improvement for in-domain\nropes and a 15% improvement for out-of-distribution ropes in simulation, as\nwell as a 26% improvement for ropes and a 50% improvement for cloths in the\nreal world), demonstrating the effectiveness of our approach in one-shot\ndeformable object manipulation.\n","authors":["So Kuroki","Jiaxian Guo","Tatsuya Matsushima","Takuya Okubo","Masato Kobayashi","Yuya Ikeda","Ryosuke Takanami","Paul Yoo","Yutaka Matsuo","Yusuke Iwasawa"],"pdf_url":"https://arxiv.org/pdf/2309.09051v4.pdf","comment":"Published in the 2024 IEEE International Conference on Robotics and\n Automation (ICRA 2024). arXiv admin note: substantial text overlap with\n arXiv:2306.09872"},{"id":"http://arxiv.org/abs/2312.02008v4","updated":"2025-01-27T06:06:17Z","published":"2023-12-04T16:30:19Z","title":"Multi-Agent Behavior Retrieval: Retrieval-Augmented Policy Training for\n Cooperative Push Manipulation by Mobile Robots","summary":" Due to the complex interactions between agents, learning multi-agent control\npolicy often requires a prohibited amount of data. This paper aims to enable\nmulti-agent systems to effectively utilize past memories to adapt to novel\ncollaborative tasks in a data-efficient fashion. We propose the Multi-Agent\nCoordination Skill Database, a repository for storing a collection of\ncoordinated behaviors associated with key vectors distinctive to them. Our\nTransformer-based skill encoder effectively captures spatio-temporal\ninteractions that contribute to coordination and provides a unique skill\nrepresentation for each coordinated behavior. By leveraging only a small number\nof demonstrations of the target task, the database enables us to train the\npolicy using a dataset augmented with the retrieved demonstrations.\nExperimental evaluations demonstrate that our method achieves a significantly\nhigher success rate in push manipulation tasks compared with baseline methods\nlike few-shot imitation learning. 
Furthermore, we validate the effectiveness of\nour retrieve-and-learn framework in a real environment using a team of wheeled\nrobots.\n","authors":["So Kuroki","Mai Nishimura","Tadashi Kozuno"],"pdf_url":"https://arxiv.org/pdf/2312.02008v4.pdf","comment":"Published in the 2024 IEEE/RSJ International Conference on\n Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2501.14616v2","updated":"2025-01-27T05:44:42Z","published":"2025-01-24T16:33:56Z","title":"QuIP: Experimental design for expensive simulators with many Qualitative\n factors via Integer Programming","summary":" The need to explore and/or optimize expensive simulators with many\nqualitative factors arises in broad scientific and engineering problems. Our\nmotivating application lies in path planning - the exploration of feasible\npaths for navigation, which plays an important role in robotics, surgical\nplanning and assembly planning. Here, the feasibility of a path is evaluated\nvia expensive virtual experiments, and its parameter space is typically\ndiscrete and high-dimensional. A carefully selected experimental design is thus\nessential for timely decision-making. We propose here a novel framework, called\nQuIP, for experimental design of Qualitative factors via Integer Programming\nunder a Gaussian process surrogate model with an exchangeable covariance\nfunction. For initial design, we show that its asymptotic D-optimal design can\nbe formulated as a variant of the well-known assignment problem in operations\nresearch, which can be efficiently solved to global optimality using\nstate-of-the-art integer programming solvers. For sequential design\n(specifically, for active learning or black-box optimization), we show that its\ndesign criterion can similarly be formulated as an assignment problem, thus\nenabling efficient and reliable optimization with existing solvers. We then\ndemonstrate the effectiveness of QuIP over existing methods in a suite of path\nplanning experiments and an application to rover trajectory optimization.\n","authors":["Yen-Chun Liu","Simon Mak"],"pdf_url":"https://arxiv.org/pdf/2501.14616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15768v1","updated":"2025-01-27T04:34:37Z","published":"2025-01-27T04:34:37Z","title":"Error-State LQR Formulation for Quadrotor UAV Trajectory Tracking","summary":" This article presents an error-state Linear Quadratic Regulator (LQR)\nformulation for robust trajectory tracking in quadrotor Unmanned Aerial\nVehicles (UAVs). The proposed approach leverages error-state dynamics and\nemploys exponential coordinates to represent orientation errors, enabling a\nlinearized system representation for real-time control. 
The control strategy\nintegrates an LQR-based full-state feedback controller for trajectory tracking,\ncombined with a cascaded bodyrate controller to handle actuator dynamics.\nDetailed derivations of the error-state dynamics, the linearization process,\nand the controller design are provided, highlighting the applicability of the\nmethod for precise and stable quadrotor control in dynamic environments.\n","authors":["Micah Reich"],"pdf_url":"https://arxiv.org/pdf/2501.15768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16539v1","updated":"2025-01-27T22:20:48Z","published":"2025-01-27T22:20:48Z","title":"Generalized Mission Planning for Heterogeneous Multi-Robot Teams via\n LLM-constructed Hierarchical Trees","summary":" We present a novel mission-planning strategy for heterogeneous multi-robot\nteams, taking into account the specific constraints and capabilities of each\nrobot. Our approach employs hierarchical trees to systematically break down\ncomplex missions into manageable sub-tasks. We develop specialized APIs and\ntools, which are utilized by Large Language Models (LLMs) to efficiently\nconstruct these hierarchical trees. Once the hierarchical tree is generated, it\nis further decomposed to create optimized schedules for each robot, ensuring\nadherence to their individual constraints and capabilities. We demonstrate the\neffectiveness of our framework through detailed examples covering a wide range\nof missions, showcasing its flexibility and scalability.\n","authors":["Piyush Gupta","David Isele","Enna Sachdeva","Pin-Hao Huang","Behzad Dariush","Kwonjoon Lee","Sangjae Bae"],"pdf_url":"https://arxiv.org/pdf/2501.16539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14554v3","updated":"2025-01-27T21:07:49Z","published":"2024-01-25T22:49:13Z","title":"GCBF+: A Neural Graph Control Barrier Function Framework for Distributed\n Safe Multi-Agent Control","summary":" Distributed, scalable, and safe control of large-scale multi-agent systems is\na challenging problem. In this paper, we design a distributed framework for\nsafe multi-agent control in large-scale environments with obstacles, where a\nlarge number of agents are required to maintain safety using only local\ninformation and reach their goal locations. We introduce a new class of\ncertificates, termed graph control barrier function (GCBF), which are based on\nthe well-established control barrier function theory for safety guarantees and\nutilize a graph structure for scalable and generalizable distributed control of\nMAS. We develop a novel theoretical framework to prove the safety of an\narbitrary-sized MAS with a single GCBF. We propose a new training framework\nGCBF+ that uses graph neural networks to parameterize a candidate GCBF and a\ndistributed control policy. The proposed framework is distributed and is\ncapable of taking point clouds from LiDAR, instead of actual state information,\nfor real-world robotic applications. We illustrate the efficacy of the proposed\nmethod through various hardware experiments on a swarm of drones with\nobjectives ranging from exchanging positions to docking on a moving target\nwithout collision. Additionally, we perform extensive numerical experiments,\nwhere the number and density of agents, as well as the number of obstacles,\nincrease. 
Empirical results show that in complex environments with agents with\nnonlinear dynamics (e.g., Crazyflie drones), GCBF+ outperforms the hand-crafted\nCBF-based method with the best performance by up to 20% for relatively\nsmall-scale MAS with up to 256 agents, and leading reinforcement learning (RL)\nmethods by up to 40% for MAS with 1024 agents. Furthermore, the proposed method\ndoes not compromise on the performance, in terms of goal reaching, for\nachieving high safety rates, which is a common trade-off in RL-based methods.\n","authors":["Songyuan Zhang","Oswin So","Kunal Garg","Chuchu Fan"],"pdf_url":"https://arxiv.org/pdf/2401.14554v3.pdf","comment":"20 pages, 15 figures; Accepted by IEEE Transactions on Robotics\n (T-RO)"},{"id":"http://arxiv.org/abs/2501.16485v1","updated":"2025-01-27T20:41:38Z","published":"2025-01-27T20:41:38Z","title":"Enhanced Position Estimation in Tactile Internet-Enabled Remote Robotic\n Surgery Using MOESP-Based Kalman Filter","summary":" Accurately estimating the position of a patient's side robotic arm in real\ntime during remote surgery is a significant challenge, especially within\nTactile Internet (TI) environments. This paper presents a new and efficient\nmethod for position estimation using a Kalman Filter (KF) combined with the\nMultivariable Output-Error State Space (MOESP) method for system\nidentification. Unlike traditional approaches that require prior knowledge of\nthe system's dynamics, this study uses the JIGSAW dataset, a comprehensive\ncollection of robotic surgical data, along with input from the Master Tool\nManipulator (MTM) to derive the state-space model directly. The MOESP method\nallows accurate modeling of the Patient Side Manipulator (PSM) dynamics without\nprior system models, improving the KF's performance under simulated network\nconditions, including delays, jitter, and packet loss. These conditions mimic\nreal-world challenges in Tactile Internet applications. The findings\ndemonstrate the KF's improved resilience and accuracy in state estimation,\nachieving over 95 percent accuracy despite network-induced uncertainties.\n","authors":["Muhammad Hanif Lashari","Wafa Batayneh","Ashfaq Khokhar","Shakil Ahmed"],"pdf_url":"https://arxiv.org/pdf/2501.16485v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.04503"},{"id":"http://arxiv.org/abs/2501.16480v1","updated":"2025-01-27T20:21:18Z","published":"2025-01-27T20:21:18Z","title":"Modular Framework for Uncertainty Prediction in Autonomous Vehicle\n Motion Forecasting within Complex Traffic Scenarios","summary":" We propose a modular modeling framework designed to enhance the capture and\nvalidation of uncertainty in autonomous vehicle (AV) trajectory prediction.\nDeparting from traditional deterministic methods, our approach employs a\nflexible, end-to-end differentiable probabilistic encoder-decoder architecture.\nThis modular design allows the encoder and decoder to be trained independently,\nenabling seamless adaptation to diverse traffic scenarios without retraining\nthe entire system. Our key contributions include: (1) a probabilistic heatmap\npredictor that generates context-aware occupancy grids for dynamic forecasting,\n(2) a modular training approach that supports independent component training\nand flexible adaptation, and (3) a structured validation scheme leveraging\nuncertainty metrics to evaluate robustness under high-risk conditions. 
To\nhighlight the benefits of our framework, we benchmark it against an end-to-end\nbaseline, demonstrating faster convergence, improved stability, and\nflexibility. Experimental results validate these advantages, showcasing the\ncapacity of the framework to efficiently handle complex scenarios while\nensuring reliable predictions and robust uncertainty representation. This\nmodular design offers significant practical utility and scalability for\nreal-world autonomous driving applications.\n","authors":["Han Wang","Yuneil Yeo","Antonio R. Paiva","Jean Utke","Maria Laura Delle Monache"],"pdf_url":"https://arxiv.org/pdf/2501.16480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16445v2","updated":"2025-01-27T20:15:08Z","published":"2024-10-21T19:15:41Z","title":"Automated Planning Domain Inference for Task and Motion Planning","summary":" Task and motion planning (TAMP) frameworks address long and complex planning\nproblems by integrating high-level task planners with low-level motion\nplanners. However, existing TAMP methods rely heavily on the manual design of\nplanning domains that specify the preconditions and postconditions of all\nhigh-level actions. This paper proposes a method to automate planning domain\ninference from a handful of test-time trajectory demonstrations, reducing the\nreliance on human design. Our approach incorporates a deep learning-based\nestimator that predicts the appropriate components of a domain for a new task\nand a search algorithm that refines this prediction, reducing the size and\nensuring the utility of the inferred domain. Our method is able to generate new\ndomains from minimal demonstrations at test time, enabling robots to handle\ncomplex tasks more efficiently. We demonstrate that our approach outperforms\nbehavior cloning baselines, which directly imitate planner behavior, in terms\nof planning performance and generalization across a variety of tasks.\nAdditionally, our method reduces computational costs and data amount\nrequirements at test time for inferring new planning domains.\n","authors":["Jinbang Huang","Allen Tao","Rozilyn Marco","Miroslav Bogdanovic","Jonathan Kelly","Florian Shkurti"],"pdf_url":"https://arxiv.org/pdf/2410.16445v2.pdf","comment":"Accepted to 2025 International Conference on Robotics and\n Automation(ICRA) 8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2501.16458v1","updated":"2025-01-27T19:37:18Z","published":"2025-01-27T19:37:18Z","title":"BiFold: Bimanual Cloth Folding with Language Guidance","summary":" Cloth folding is a complex task due to the inevitable self-occlusions of\nclothes, their complicated dynamics, and the disparate materials, geometries,\nand textures that garments can have. In this work, we learn folding actions\nconditioned on text commands. Translating high-level, abstract instructions\ninto precise robotic actions requires sophisticated language understanding and\nmanipulation capabilities. To do that, we leverage a pre-trained\nvision-language model and repurpose it to predict manipulation actions. Our\nmodel, BiFold, can take context into account and achieves state-of-the-art\nperformance on an existing language-conditioned folding benchmark. Given the\nlack of annotated bimanual folding data, we devise a procedure to automatically\nparse actions of a simulated dataset and tag them with aligned text\ninstructions. 
BiFold attains the best performance on our dataset and can\ntransfer to new instructions, garments, and environments.\n","authors":["Oriol Barbany","Adrià Colomé","Carme Torras"],"pdf_url":"https://arxiv.org/pdf/2501.16458v1.pdf","comment":"Accepted at ICRA 2025"},{"id":"http://arxiv.org/abs/2501.16411v1","updated":"2025-01-27T18:59:58Z","published":"2025-01-27T18:59:58Z","title":"PhysBench: Benchmarking and Enhancing Vision-Language Models for\n Physical World Understanding","summary":" Understanding the physical world is a fundamental challenge in embodied AI,\ncritical for enabling agents to perform complex tasks and operate safely in\nreal-world environments. While Vision-Language Models (VLMs) have shown great\npromise in reasoning and task planning for embodied agents, their ability to\ncomprehend physical phenomena remains extremely limited. To close this gap, we\nintroduce PhysBench, a comprehensive benchmark designed to evaluate VLMs'\nphysical world understanding capability across a diverse set of tasks.\nPhysBench contains 100,000 entries of interleaved video-image-text data,\ncategorized into four major domains: physical object properties, physical\nobject relationships, physical scene understanding, and physics-based dynamics,\nfurther divided into 19 subclasses and 8 distinct capability dimensions. Our\nextensive experiments, conducted on 75 representative VLMs, reveal that while\nthese models excel in common-sense reasoning, they struggle with understanding\nthe physical world -- likely due to the absence of physical knowledge in their\ntraining data and the lack of embedded physical priors. To tackle the\nshortfall, we introduce PhysAgent, a novel framework that combines the\ngeneralization strengths of VLMs with the specialized expertise of vision\nmodels, significantly enhancing VLMs' physical understanding across a variety\nof tasks, including an 18.4\\% improvement on GPT-4o. Furthermore, our results\ndemonstrate that enhancing VLMs' physical world understanding capabilities can\nhelp embodied agents such as MOKA. We believe that PhysBench and PhysAgent\noffer valuable insights and contribute to bridging the gap between VLMs and\nphysical world understanding.\n","authors":["Wei Chow","Jiageng Mao","Boyi Li","Daniel Seita","Vitor Guizilini","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2501.16411v1.pdf","comment":"ICLR 2025. Project page: https://physbench.github.io/; Dataset:\n https://huggingface.co/datasets/USC-GVL/PhysBench;"},{"id":"http://arxiv.org/abs/2011.04820v4","updated":"2025-01-27T18:56:16Z","published":"2020-11-09T23:15:31Z","title":"Decentralized Structural-RNN for Robot Crowd Navigation with Deep\n Reinforcement Learning","summary":" Safe and efficient navigation through human crowds is an essential capability\nfor mobile robots. Previous work on robot crowd navigation assumes that the\ndynamics of all agents are known and well-defined. In addition, the performance\nof previous methods deteriorates in partially observable environments and\nenvironments with dense crowds. To tackle these problems, we propose\ndecentralized structural-Recurrent Neural Network (DS-RNN), a novel network\nthat reasons about spatial and temporal relationships for robot decision making\nin crowd navigation. We train our network with model-free deep reinforcement\nlearning without any expert supervision. We demonstrate that our model\noutperforms previous methods in challenging crowd navigation scenarios. 
We\nsuccessfully transfer the policy learned in the simulator to a real-world\nTurtleBot 2i. For more information, please visit the project website at\nhttps://sites.google.com/view/crowdnav-ds-rnn/home.\n","authors":["Shuijing Liu","Peixin Chang","Weihang Liang","Neeloy Chakraborty","Katherine Driggs-Campbell"],"pdf_url":"https://arxiv.org/pdf/2011.04820v4.pdf","comment":"Published as a conference paper in IEEE International Conference on\n Robotics and Automation (ICRA), 2021"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.16330v1","updated":"2025-01-27T18:59:57Z","published":"2025-01-27T18:59:57Z","title":"RelightVid: Temporal-Consistent Diffusion Model for Video Relighting","summary":" Diffusion models have demonstrated remarkable success in image generation and\nediting, with recent advancements enabling albedo-preserving image relighting.\nHowever, applying these models to video relighting remains challenging due to\nthe lack of paired video relighting datasets and the high demands for output\nfidelity and temporal consistency, further complicated by the inherent\nrandomness of diffusion models. To address these challenges, we introduce\nRelightVid, a flexible framework for video relighting that can accept\nbackground video, text prompts, or environment maps as relighting conditions.\nTrained on in-the-wild videos with carefully designed illumination\naugmentations and rendered videos under extreme dynamic lighting, RelightVid\nachieves arbitrary video relighting with high temporal consistency without\nintrinsic decomposition while preserving the illumination priors of its image\nbackbone.\n","authors":["Ye Fang","Zeyi Sun","Shangzhan Zhang","Tong Wu","Yinghao Xu","Pan Zhang","Jiaqi Wang","Gordon Wetzstein","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2501.16330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13896v2","updated":"2025-01-27T18:58:42Z","published":"2025-01-23T18:16:21Z","title":"GUI-Bee: Align GUI Action Grounding to Novel Environments via Autonomous\n Exploration","summary":" Graphical User Interface (GUI) action grounding is a critical step in GUI\nautomation that maps language instructions to actionable elements on GUI\nscreens. Most recent works of GUI action grounding leverage large GUI datasets\nto fine-tune MLLMs. However, the fine-tuning data always covers limited GUI\nenvironments, and we find the performance of the resulting model deteriorates\nin novel environments. We argue that the GUI grounding models should be further\naligned to the novel environments to reveal their full potential, when the\ninference is known to involve novel environments, i.e., environments not used\nduring the previous fine-tuning. To realize this, we first propose GUI-Bee, an\nMLLM-based autonomous agent, to collect high-quality, environment-specific data\nthrough exploration and then continuously fine-tune GUI grounding models with\nthe collected data. Our agent leverages a novel Q-value-Incentive In-Context\nReinforcement Learning (Q-ICRL) method to optimize exploration efficiency and\ndata quality. Additionally, we introduce NovelScreenSpot, a benchmark for\ntesting how well the data can help align GUI action grounding models to novel\nenvironments and demonstrate the effectiveness of data collected by GUI-Bee in\nthe experiments. Furthermore, we conduct an ablation study to validate the\nQ-ICRL method in enhancing the efficiency of GUI-Bee. 
Project page:\nhttps://gui-bee.github.io\n","authors":["Yue Fan","Handong Zhao","Ruiyi Zhang","Yu Shen","Xin Eric Wang","Gang Wu"],"pdf_url":"https://arxiv.org/pdf/2501.13896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16319v1","updated":"2025-01-27T18:55:21Z","published":"2025-01-27T18:55:21Z","title":"Adaptive Iterative Compression for High-Resolution Files: an Approach\n Focused on Preserving Visual Quality in Cinematic Workflows","summary":" This study presents an iterative adaptive compression model for\nhigh-resolution DPX-derived TIFF files used in cinematographic workflows and\ndigital preservation. The model employs SSIM and PSNR metrics to dynamically\nadjust compression parameters across three configurations (C0, C1, C2),\nachieving storage reductions up to 83.4 % while maintaining high visual\nfidelity (SSIM > 0.95). Validation across three diverse productions - black and\nwhite classic, soft-palette drama, and complex action film - demonstrated the\nmethod's effectiveness in preserving critical visual elements while\nsignificantly reducing storage requirements. Professional evaluators reported\n90% acceptance rate for the optimal C1 configuration, with artifacts remaining\nbelow perceptual threshold in critical areas. Comparative analysis with\nJPEG2000 and H.265 showed superior quality preservation at equivalent\ncompression rates, particularly for high bit-depth content. While requiring\nadditional computational overhead, the method's storage benefits and quality\ncontrol capabilities make it suitable for professional workflows, with\npotential applications in medical imaging and cloud storage optimization.\n","authors":["Leonardo Melo","Filipe Litaiff"],"pdf_url":"https://arxiv.org/pdf/2501.16319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16312v1","updated":"2025-01-27T18:49:38Z","published":"2025-01-27T18:49:38Z","title":"LinPrim: Linear Primitives for Differentiable Volumetric Rendering","summary":" Volumetric rendering has become central to modern novel view synthesis\nmethods, which use differentiable rendering to optimize 3D scene\nrepresentations directly from observed views. While many recent works build on\nNeRF or 3D Gaussians, we explore an alternative volumetric scene\nrepresentation. More specifically, we introduce two new scene representations\nbased on linear primitives-octahedra and tetrahedra-both of which define\nhomogeneous volumes bounded by triangular faces. This formulation aligns\nnaturally with standard mesh-based tools, minimizing overhead for downstream\napplications. To optimize these primitives, we present a differentiable\nrasterizer that runs efficiently on GPUs, allowing end-to-end gradient-based\noptimization while maintaining realtime rendering capabilities. Through\nexperiments on real-world datasets, we demonstrate comparable performance to\nstate-of-the-art volumetric methods while requiring fewer primitives to achieve\nsimilar reconstruction fidelity. 
Our findings provide insights into the\ngeometry of volumetric rendering and suggest that adopting explicit polyhedra\ncan expand the design space of scene representations.\n","authors":["Nicolas von Lützow","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2501.16312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15585v4","updated":"2025-01-27T18:46:41Z","published":"2024-03-22T19:19:51Z","title":"MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis","summary":" Chest X-ray images are commonly used for predicting acute and chronic\ncardiopulmonary conditions, but efforts to integrate them with structured\nclinical data face challenges due to incomplete electronic health records\n(EHR). This paper introduces MedPromptX, the first clinical decision support\nsystem that integrates multimodal large language models (MLLMs), few-shot\nprompting (FP) and visual grounding (VG) to combine imagery with EHR data for\nchest X-ray diagnosis. A pre-trained MLLM is utilized to complement the missing\nEHR information, providing a comprehensive understanding of patients' medical\nhistory. Additionally, FP reduces the necessity for extensive training of MLLMs\nwhile effectively tackling the issue of hallucination. Nevertheless, the\nprocess of determining the optimal number of few-shot examples and selecting\nhigh-quality candidates can be burdensome, yet it profoundly influences model\nperformance. Hence, we propose a new technique that dynamically refines\nfew-shot data for real-time adjustment to new patient scenarios. Moreover, VG\nnarrows the search area in X-ray images, thereby enhancing the identification\nof abnormalities. We also release MedPromptX-VQA, a new in-context visual\nquestion answering dataset encompassing interleaved images and EHR data derived\nfrom MIMIC-IV and MIMIC-CXR-JPG databases. Results demonstrate the SOTA\nperformance of MedPromptX, achieving an 11% improvement in F1-score compared to\nthe baselines. Code and data are publicly available on\nhttps://github.com/BioMedIA-MBZUAI/MedPromptX.\n","authors":["Mai A. Shaaban","Adnan Khan","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.15585v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16300v1","updated":"2025-01-27T18:38:36Z","published":"2025-01-27T18:38:36Z","title":"Large Models in Dialogue for Active Perception and Anomaly Detection","summary":" Autonomous aerial monitoring is an important task aimed at gathering\ninformation from areas that may not be easily accessible by humans. At the same\ntime, this task often requires recognizing anomalies from a significant\ndistance or not previously encountered in the past. In this paper, we propose a\nnovel framework that leverages the advanced capabilities provided by Large\nLanguage Models (LLMs) to actively collect information and perform anomaly\ndetection in novel scenes. To this end, we propose an LLM based model dialogue\napproach, in which two deep learning models engage in a dialogue to actively\ncontrol a drone to increase perception and anomaly detection accuracy. We\nconduct our experiments in a high fidelity simulation environment where an LLM\nis provided with a predetermined set of natural language movement commands\nmapped into executable code functions. Additionally, we deploy a multimodal\nVisual Question Answering (VQA) model charged with the task of visual question\nanswering and captioning. 
By engaging the two models in conversation, the LLM\nasks exploratory questions while simultaneously flying a drone into different\nparts of the scene, providing a novel way to implement active perception. By\nleveraging LLMs reasoning ability, we output an improved detailed description\nof the scene going beyond existing static perception approaches. In addition to\ninformation gathering, our approach is utilized for anomaly detection and our\nresults demonstrate the proposed methods effectiveness in informing and\nalerting about potential hazards.\n","authors":["Tzoulio Chamiti","Nikolaos Passalis","Anastasios Tefas"],"pdf_url":"https://arxiv.org/pdf/2501.16300v1.pdf","comment":"Accepted to International Conference of Pattern Recognition (ICPR\n 2024)"},{"id":"http://arxiv.org/abs/2501.16297v1","updated":"2025-01-27T18:36:10Z","published":"2025-01-27T18:36:10Z","title":"FALCON: Resolving Visual Redundancy and Fragmentation in High-resolution\n Multimodal Large Language Models via Visual Registers","summary":" The incorporation of high-resolution visual input equips multimodal large\nlanguage models (MLLMs) with enhanced visual perception capabilities for\nreal-world tasks. However, most existing high-resolution MLLMs rely on a\ncropping-based approach to process images, which leads to fragmented visual\nencoding and a sharp increase in redundant tokens. To tackle these issues, we\npropose the FALCON model. FALCON introduces a novel visual register technique\nto simultaneously: 1) Eliminate redundant tokens at the stage of visual\nencoding. To directly address the visual redundancy present in the output of\nvision encoder, we propose a Register-based Representation Compacting\n(ReCompact) mechanism. This mechanism introduces a set of learnable visual\nregisters designed to adaptively aggregate essential information while\ndiscarding redundancy. It enables the encoder to produce a more compact visual\nrepresentation with a minimal number of output tokens, thus eliminating the\nneed for an additional compression module. 2) Ensure continuity in visual\nencoding. To address the potential encoding errors caused by fragmented visual\ninputs, we develop a Register Interactive Attention (ReAtten) module. This\nmodule facilitates effective and efficient information exchange across\nsub-images by enabling interactions between visual registers. It ensures the\ncontinuity of visual semantics throughout the encoding. We conduct\ncomprehensive experiments with FALCON on high-resolution benchmarks across a\nwide range of scenarios. FALCON demonstrates superior performance with a\nremarkable 9-fold and 16-fold reduction in visual tokens.\n","authors":["Renshan Zhang","Rui Shao","Gongwei Chen","Kaiwen Zhou","Weili Guan","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2501.16297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16295v1","updated":"2025-01-27T18:35:05Z","published":"2025-01-27T18:35:05Z","title":"Mixture-of-Mamba: Enhancing Multi-Modal State-Space Models with\n Modality-Aware Sparsity","summary":" State Space Models (SSMs) have emerged as efficient alternatives to\nTransformers for sequential modeling, but their inability to leverage\nmodality-specific features limits their performance in multi-modal pretraining.\nHere, we propose Mixture-of-Mamba, a novel SSM architecture that introduces\nmodality-aware sparsity through modality-specific parameterization of the Mamba\nblock. Building on Mixture-of-Transformers (W. Liang et al. 
arXiv:2411.04996;\n2024), we extend the benefits of modality-aware sparsity to SSMs while\npreserving their computational efficiency. We evaluate Mixture-of-Mamba across\nthree multi-modal pretraining settings: Transfusion (interleaved text and\ncontinuous image tokens with diffusion loss), Chameleon (interleaved text and\ndiscrete image tokens), and an extended three-modality framework incorporating\nspeech. Mixture-of-Mamba consistently reaches the same loss values at earlier\ntraining steps with significantly reduced computational costs. In the\nTransfusion setting, Mixture-of-Mamba achieves equivalent image loss using only\n34.76% of the training FLOPs at the 1.4B scale. In the Chameleon setting,\nMixture-of-Mamba reaches similar image loss with just 42.50% of the FLOPs at\nthe 1.4B scale, and similar text loss with just 65.40% of the FLOPs. In the\nthree-modality setting, MoM matches speech loss at 24.80% of the FLOPs at the\n1.4B scale. Our ablation study highlights the synergistic effects of decoupling\nprojection components, where joint decoupling yields greater gains than\nindividual modifications. These results establish modality-aware sparsity as a\nversatile and effective design principle, extending its impact from\nTransformers to SSMs and setting new benchmarks in multi-modal pretraining. Our\ncode can be accessed at https://github.com/Weixin-Liang/Mixture-of-Mamba\n","authors":["Weixin Liang","Junhong Shen","Genghan Zhang","Ning Dong","Luke Zettlemoyer","Lili Yu"],"pdf_url":"https://arxiv.org/pdf/2501.16295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16289v1","updated":"2025-01-27T18:25:35Z","published":"2025-01-27T18:25:35Z","title":"Multi-view Structural Convolution Network for Domain-Invariant Point\n Cloud Recognition of Autonomous Vehicles","summary":" Point cloud representation has recently become a research hotspot in the\nfield of computer vision and has been utilized for autonomous vehicles.\nHowever, adapting deep learning networks for point cloud data recognition is\nchallenging due to the variability in datasets and sensor technologies. This\nvariability underscores the necessity for adaptive techniques to maintain\naccuracy under different conditions. In this paper, we present the Multi-View\nStructural Convolution Network (MSCN) designed for domain-invariant point cloud\nrecognition. MSCN comprises Structural Convolution Layers (SCL) that extract\nlocal context geometric features from point clouds and Structural Aggregation\nLayers (SAL) that extract and aggregate both local and overall context features\nfrom point clouds. Additionally, our MSCN enhances feature representation\nrobustness by training with unseen domain point clouds derived from source\ndomain point clouds. This method acquires domain-invariant features and\nexhibits robust, consistent performance across various point cloud datasets,\nensuring compatibility with diverse sensor configurations without the need for\nparameter adjustments. This highlights MSCN's potential to significantly\nimprove the reliability and domain invariant features in different\nenvironments. 
Our code is available at https://github.com/MLMLab/MSCN.\n","authors":["Younggun Kim","Beomsik Cho","Seonghoon Ryoo","Soomok Lee"],"pdf_url":"https://arxiv.org/pdf/2501.16289v1.pdf","comment":"16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.05731v2","updated":"2025-01-27T18:21:19Z","published":"2024-11-08T17:42:02Z","title":"PEP-GS: Perceptually-Enhanced Precise Structured 3D Gaussians for\n View-Adaptive Rendering","summary":" Recently, 3D Gaussian Splatting (3D-GS) has achieved significant success in\nreal-time, high-quality 3D scene rendering. However, it faces several\nchallenges, including Gaussian redundancy, limited ability to capture\nview-dependent effects, and difficulties in handling complex lighting and\nspecular reflections. Additionally, methods that use spherical harmonics for\ncolor representation often struggle to effectively capture specular highlights\nand anisotropic components, especially when modeling view-dependent colors\nunder complex lighting conditions, leading to insufficient contrast and\nunnatural color saturation. To address these limitations, we introduce PEP-GS,\na perceptually-enhanced framework that dynamically predicts Gaussian\nattributes, including opacity, color, and covariance. We replace traditional\nspherical harmonics with a Hierarchical Granular-Structural Attention\nmechanism, which enables more accurate modeling of complex view-dependent color\neffects and specular highlights. By employing a stable and interpretable\nframework for opacity and covariance estimation, PEP-GS avoids the removal of\nessential Gaussians prematurely, ensuring a more accurate scene representation.\nFurthermore, perceptual optimization is applied to the final rendered images,\nenhancing perceptual consistency across different views and ensuring\nhigh-quality renderings with improved texture fidelity and fine-scale detail\npreservation. Experimental results demonstrate that PEP-GS outperforms\nstate-of-the-art methods, particularly in challenging scenarios involving\nview-dependent effects, specular reflections, and fine-scale details.\n","authors":["Junxi Jin","Xiulai Li","Haiping Huang","Lianjun Liu","Yujie Sun","Boyi Liu"],"pdf_url":"https://arxiv.org/pdf/2411.05731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16282v1","updated":"2025-01-27T18:20:49Z","published":"2025-01-27T18:20:49Z","title":"Brain-Adapter: Enhancing Neurological Disorder Analysis with\n Adapter-Tuning Multimodal Large Language Models","summary":" Understanding brain disorders is crucial for accurate clinical diagnosis and\ntreatment. Recent advances in Multimodal Large Language Models (MLLMs) offer a\npromising approach to interpreting medical images with the support of text\ndescriptions. However, previous research has primarily focused on 2D medical\nimages, leaving richer spatial information of 3D images under-explored, and\nsingle-modality-based methods are limited by overlooking the critical clinical\ninformation contained in other modalities. To address this issue, this paper\nproposes Brain-Adapter, a novel approach that incorporates an extra bottleneck\nlayer to learn new knowledge and instill it into the original pre-trained\nknowledge. The major idea is to incorporate a lightweight bottleneck layer to\ntrain fewer parameters while capturing essential information and utilize a\nContrastive Language-Image Pre-training (CLIP) strategy to align multimodal\ndata within a unified representation space. 
Extensive experiments demonstrated\nthe effectiveness of our approach in integrating multimodal data to\nsignificantly improve the diagnosis accuracy without high computational costs,\nhighlighting the potential to enhance real-world diagnostic workflows.\n","authors":["Jing Zhang","Xiaowei Yu","Yanjun Lyu","Lu Zhang","Tong Chen","Chao Cao","Yan Zhuang","Minheng Chen","Tianming Liu","Dajiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.16282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00958v3","updated":"2025-01-27T18:17:26Z","published":"2025-01-01T21:29:37Z","title":"2.5 Years in Class: A Multimodal Textbook for Vision-Language\n Pretraining","summary":" Compared to image-text pair data, interleaved corpora enable Vision-Language\nModels (VLMs) to understand the world more naturally like humans. However, such\nexisting datasets are crawled from webpage, facing challenges like low\nknowledge density, loose image-text relations, and poor logical coherence\nbetween images. On the other hand, the internet hosts vast instructional videos\n(e.g., online geometry courses) that are widely used by humans to learn\nfoundational subjects, yet these valuable resources remain underexplored in VLM\ntraining. In this paper, we introduce a high-quality \\textbf{multimodal\ntextbook} corpus with richer foundational knowledge for VLM pretraining. It\ncollects over 2.5 years of instructional videos, totaling 22,000 class hours.\nWe first use an LLM-proposed taxonomy to systematically gather instructional\nvideos. Then we progressively extract and refine visual (keyframes), audio\n(ASR), and textual knowledge (OCR) from the videos, and organize as an\nimage-text interleaved corpus based on temporal order. Compared to its\ncounterparts, our video-centric textbook offers more coherent context, richer\nknowledge, and better image-text alignment. Experiments demonstrate its superb\npretraining performance, particularly in knowledge- and reasoning-intensive\ntasks like ScienceQA and MathVista. Moreover, VLMs pre-trained on our textbook\nexhibit outstanding interleaved context awareness, leveraging visual and\ntextual cues in their few-shot context for task solving. Our code are available\nat https://github.com/DAMO-NLP-SG/multimodal_textbook.\n","authors":["Wenqi Zhang","Hang Zhang","Xin Li","Jiashuo Sun","Yongliang Shen","Weiming Lu","Deli Zhao","Yueting Zhuang","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2501.00958v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2501.16273v1","updated":"2025-01-27T18:06:36Z","published":"2025-01-27T18:06:36Z","title":"Return of the Encoder: Maximizing Parameter Efficiency for SLMs","summary":" The dominance of large decoder-only language models has overshadowed\nencoder-decoder architectures, despite their fundamental efficiency advantages\nin sequence processing. For small language models (SLMs) - those with 1 billion\nparameters or fewer - our systematic analysis across GPU, CPU, and NPU\nplatforms reveals that encoder-decoder architectures achieve 47% lower\nfirst-token latency and 4.7x higher throughput compared to decoder-only models\non edge devices. 
These gains may be attributed to encoder-decoder's one-time\ninput processing and efficient separation of understanding and generation\nphases.\n We introduce a novel knowledge distillation framework that enables\nencoder-decoder models to leverage capabilities from large scalable\ndecoder-only teachers while preserving their architectural advantages,\nachieving up to 6 average performance points improvement across diverse tasks,\nwith significant gains in asymmetric sequence tasks where input and output\ndistributions can benefit from different processing approaches.\n When combined with modern advances like Rotary Positional Embeddings (RoPE)\nand Vision encoders, our systematic investigation demonstrates that\nencoder-decoder architectures provide a more practical path toward deploying\ncapable language models in resource-constrained environments. Our findings\nchallenge the prevailing trend toward decoder-only scaling, showing that\narchitectural choices become increasingly crucial as parameter budgets\ndecrease, particularly for on-device and edge deployments where computational\nefficiency is paramount.\n","authors":["Mohamed Elfeki","Rui Liu","Chad Voegele"],"pdf_url":"https://arxiv.org/pdf/2501.16273v1.pdf","comment":"13 pages, 5 figures. LLMs/SLMs, encoder-decoder and decoder-only"},{"id":"http://arxiv.org/abs/2501.16249v1","updated":"2025-01-27T17:51:29Z","published":"2025-01-27T17:51:29Z","title":"Lightweight Weighted Average Ensemble Model for Pneumonia Detection in\n Chest X-Ray Images","summary":" Pneumonia is a leading cause of illness and death in children, underscoring\nthe need for early and accurate detection. In this study, we propose a novel\nlightweight ensemble model for detecting pneumonia in children using chest\nX-ray images. This ensemble model integrates two pre-trained convolutional\nneural networks (CNNs), MobileNetV2 and NASNetMobile, selected for their\nbalance of computational efficiency and accuracy. These models were fine-tuned\non a pediatric chest X-ray dataset and combined to enhance classification\nperformance. Our proposed ensemble model achieved a classification accuracy of\n98.63%, significantly outperforming individual models such as MobileNetV2\n(97.10%) and NASNetMobile(96.25%) in terms of accuracy, precision, recall, and\nF1 score. Moreover, the ensemble model outperformed state-of-the-art\narchitectures, including ResNet50, InceptionV3, and DenseNet201, while\nmaintaining computational efficiency. The proposed lightweight ensemble model\npresents a highly effective and resource-efficient solution for pneumonia\ndetection, making it particularly suitable for deployment in\nresource-constrained settings.\n","authors":["Suresh Babu Nettur","Shanthi Karpurapu","Unnati Nettur","Likhit Sagar Gajja","Sravanthy Myneni","Akhil Dusi","Lalithya Posham"],"pdf_url":"https://arxiv.org/pdf/2501.16249v1.pdf","comment":"Corresponding authors: Shanthi Karpurapu\n (shanthi.karpurapu@gmail.com), Suresh Babu Nettur (nettursuresh@gmail.com)"},{"id":"http://arxiv.org/abs/2501.16246v1","updated":"2025-01-27T17:43:51Z","published":"2025-01-27T17:43:51Z","title":"CLISC: Bridging clip and sam by enhanced cam for unsupervised brain\n tumor segmentation","summary":" Brain tumor segmentation is important for diagnosis of the tumor, and current\ndeep-learning methods rely on a large set of annotated images for training,\nwith high annotation costs. Unsupervised segmentation is promising to avoid\nhuman annotations while the performance is often limited. 
In this study, we\npresent a novel unsupervised segmentation approach that leverages the\ncapabilities of foundation models, and it consists of three main steps: (1) A\nvision-language model (i.e., CLIP) is employed to obtain image-level\npseudo-labels for training a classification network. Class Activation Mapping\n(CAM) is then employed to extract Regions of Interest (ROIs), where an adaptive\nmasking-based data augmentation is used to enhance ROI identification.(2) The\nROIs are used to generate bounding box and point prompts for the Segment\nAnything Model (SAM) to obtain segmentation pseudo-labels. (3) A 3D\nsegmentation network is trained with the SAM-derived pseudo-labels, where\nlow-quality pseudo-labels are filtered out in a self-learning process based on\nthe similarity between the SAM's output and the network's prediction.\nEvaluation on the BraTS2020 dataset demonstrates that our approach obtained an\naverage Dice Similarity Score (DSC) of 85.60%, outperforming five\nstate-of-the-art unsupervised segmentation methods by more than 10 percentage\npoints. Besides, our approach outperforms directly using SAM for zero-shot\ninference, and its performance is close to fully supervised learning.\n","authors":["Xiaochuan Ma","Jia Fu","Wenjun Liao","Shichuan Zhang","Guotai Wang"],"pdf_url":"https://arxiv.org/pdf/2501.16246v1.pdf","comment":"22st IEEE International Symposium on Biomedical Imaging (ISBI 2025)"},{"id":"http://arxiv.org/abs/2411.16027v2","updated":"2025-01-27T17:43:42Z","published":"2024-11-25T01:01:54Z","title":"From Dashcam Videos to Driving Simulations: Stress Testing Automated\n Vehicles against Rare Events","summary":" Testing Automated Driving Systems (ADS) in simulation with realistic driving\nscenarios is important for verifying their performance. However, converting\nreal-world driving videos into simulation scenarios is a significant challenge\ndue to the complexity of interpreting high-dimensional video data and the\ntime-consuming nature of precise manual scenario reconstruction. In this work,\nwe propose a novel framework that automates the conversion of real-world car\ncrash videos into detailed simulation scenarios for ADS testing. Our approach\nleverages prompt-engineered Video Language Models(VLM) to transform dashcam\nfootage into SCENIC scripts, which define the environment and driving behaviors\nin the CARLA simulator, enabling the generation of realistic simulation\nscenarios. Importantly, rather than solely aiming for one-to-one scenario\nreconstruction, our framework focuses on capturing the essential driving\nbehaviors from the original video while offering flexibility in parameters such\nas weather or road conditions to facilitate search-based testing. Additionally,\nwe introduce a similarity metric that helps iteratively refine the generated\nscenario through feedback by comparing key features of driving behaviors\nbetween the real and simulated videos. 
Our preliminary results demonstrate\nsubstantial time efficiency, finishing the real-to-sim conversion in minutes\nwith full automation and no human intervention, while maintaining high fidelity\nto the original driving events.\n","authors":["Yan Miao","Georgios Fainekos","Bardh Hoxha","Hideki Okamoto","Danil Prokhorov","Sayan Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.16027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16239v1","updated":"2025-01-27T17:35:39Z","published":"2025-01-27T17:35:39Z","title":"Distilling foundation models for robust and efficient models in digital\n pathology","summary":" In recent years, the advent of foundation models (FM) for digital pathology\nhas relied heavily on scaling the pre-training datasets and the model size,\nyielding large and powerful models. While it resulted in improving the\nperformance on diverse downstream tasks, it also introduced increased\ncomputational cost and inference time. In this work, we explore the\ndistillation of a large foundation model into a smaller one, reducing the\nnumber of parameters by several orders of magnitude. Leveraging distillation\ntechniques, our distilled model, H0-mini, achieves nearly comparable\nperformance to large FMs at a significantly reduced inference cost. It is\nevaluated on several public benchmarks, achieving 3rd place on the HEST\nbenchmark and 5th place on the EVA benchmark. Additionally, a robustness\nanalysis conducted on the PLISM dataset demonstrates that our distilled model\nreaches excellent robustness to variations in staining and scanning conditions,\nsignificantly outperforming other state-of-the art models. This opens new\nperspectives to design lightweight and robust models for digital pathology,\nwithout compromising on performance.\n","authors":["Alexandre Filiot","Nicolas Dop","Oussama Tchita","Auriane Riou","Thomas Peeters","Daria Valter","Marin Scalbert","Charlie Saillard","Geneviève Robin","Antoine Olivier"],"pdf_url":"https://arxiv.org/pdf/2501.16239v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2501.16227v1","updated":"2025-01-27T17:21:33Z","published":"2025-01-27T17:21:33Z","title":"PDC-ViT : Source Camera Identification using Pixel Difference\n Convolution and Vision Transformer","summary":" Source camera identification has emerged as a vital solution to unlock\nincidents involving critical cases like terrorism, violence, and other criminal\nactivities. The ability to trace the origin of an image/video can aid law\nenforcement agencies in gathering evidence and constructing the timeline of\nevents. Moreover, identifying the owner of a certain device narrows down the\narea of search in a criminal investigation where smartphone devices are\ninvolved. This paper proposes a new pixel-based method for source camera\nidentification, integrating Pixel Difference Convolution (PDC) with a Vision\nTransformer network (ViT), and named PDC-ViT. While the PDC acts as the\nbackbone for feature extraction by exploiting Angular PDC (APDC) and Radial PDC\n(RPDC). These techniques enhance the capability to capture subtle variations in\npixel information, which are crucial for distinguishing between different\nsource cameras. The second part of the methodology focuses on classification,\nwhich is based on a Vision Transformer network. Unlike traditional methods that\nutilize image patches directly for training the classification network, the\nproposed approach uniquely inputs PDC features into the Vision Transformer\nnetwork. 
To demonstrate the effectiveness of the PDC-ViT approach, it has been\nassessed on five different datasets, which include various image contents and\nvideo scenes. The method has also been compared with state-of-the-art source\ncamera identification methods. Experimental results demonstrate the\neffectiveness and superiority of the proposed system in terms of accuracy and\nrobustness when compared to its competitors. For example, our proposed PDC-ViT\nhas achieved an accuracy of 94.30%, 84%, 94.22% and 92.29% using the Vision\ndataset, Daxing dataset, Socrates dataset and QUFVD dataset, respectively.\n","authors":["Omar Elharrouss","Younes Akbari","Noor Almaadeed","Somaya Al-Maadeed","Fouad Khelifi","Ahmed Bouridane"],"pdf_url":"https://arxiv.org/pdf/2501.16227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16222v1","updated":"2025-01-27T17:13:03Z","published":"2025-01-27T17:13:03Z","title":"SPECIAL: Zero-shot Hyperspectral Image Classification With CLIP","summary":" Hyperspectral image (HSI) classification aims at categorizing each pixel in\nan HSI into a specific land cover class, which is crucial for applications like\nremote sensing, environmental monitoring, and agriculture. Although deep\nlearning-based HSI classification methods have achieved significant\nadvancements, existing methods still rely on manually labeled data for\ntraining, which is both time-consuming and labor-intensive.To address this\nlimitation, we introduce a novel zero-shot hyperspectral image classification\nframework based on CLIP (SPECIAL), aiming to eliminate the need for manual\nannotations. The SPECIAL framework consists of two main stages: (1) CLIP-based\npseudo-label generation, and (2) noisy label learning. In the first stage, HSI\nis spectrally interpolated to produce RGB bands. These bands are subsequently\nclassified using CLIP, resulting in noisy pseudo-labels that are accompanied by\nconfidence scores.To improve the quality of these labels, we propose a scaling\nstrategy that fuses predictions from multiple spatial scales. In the second\nstage, spectral information and a label refinement technique are incorporated\nto mitigate label noise and further enhance classification accuracy.\nExperimental results on three benchmark datasets demonstrate that our SPECIAL\noutperforms existing methods in zero-shot HSI classification, showing its\npotential for more practical applications. The code is available at\nhttps://github.com/LiPang/SPECIAL.\n","authors":["Li Pang","Jing Yao","Kaiyu Li","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2501.16222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16221v1","updated":"2025-01-27T17:10:33Z","published":"2025-01-27T17:10:33Z","title":"Automatic Calibration of a Multi-Camera System with Limited Overlapping\n Fields of View for 3D Surgical Scene Reconstruction","summary":" Purpose: The purpose of this study is to develop an automated and accurate\nexternal camera calibration method for multi-camera systems used in 3D surgical\nscene reconstruction (3D-SSR), eliminating the need for operator intervention\nor specialized expertise. The method specifically addresses the problem of\nlimited overlapping fields of view caused by significant variations in optical\nzoom levels and camera locations.\n Methods: We contribute a novel, fast, and fully automatic calibration method\nbased on the projection of multi-scale markers (MSMs) using a ceiling-mounted\nprojector. 
MSMs consist of 2D patterns projected at varying scales, ensuring\naccurate extraction of well distributed point correspondences across\nsignificantly different viewpoints and zoom levels. Validation is performed\nusing both synthetic and real data captured in a mock-up OR, with comparisons\nto traditional manual marker-based methods as well as markerless calibration\nmethods.\n Results: The method achieves accuracy comparable to manual,\noperator-dependent calibration methods while exhibiting higher robustness under\nconditions of significant differences in zoom levels. Additionally, we show\nthat state-of-the-art Structure-from-Motion (SfM) pipelines are ineffective in\n3D-SSR settings, even when additional texture is projected onto the OR floor.\n Conclusion: The use of a ceiling-mounted entry-level projector proves to be\nan effective alternative to operator-dependent, traditional marker-based\nmethods, paving the way for fully automated 3D-SSR.\n","authors":["Tim Flückiger","Jonas Hein","Valery Fischer","Philipp Fürnstahl","Lilian Calvet"],"pdf_url":"https://arxiv.org/pdf/2501.16221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16211v1","updated":"2025-01-27T17:01:45Z","published":"2025-01-27T17:01:45Z","title":"UDBE: Unsupervised Diffusion-based Brightness Enhancement in Underwater\n Images","summary":" Activities in underwater environments are paramount in several scenarios,\nwhich drives the continuous development of underwater image enhancement\ntechniques. A major challenge in this domain is the depth at which images are\ncaptured, with increasing depth resulting in a darker environment. Most\nexisting methods for underwater image enhancement focus on noise removal and\ncolor adjustment, with few works dedicated to brightness enhancement. This work\nintroduces a novel unsupervised learning approach to underwater image\nenhancement using a diffusion model. Our method, called UDBE, is based on\nconditional diffusion to maintain the brightness details of the unpaired input\nimages. The input image is combined with a color map and a Signal-Noise\nRelation map (SNR) to ensure stable training and prevent color distortion in\nthe output images. The results demonstrate that our approach achieves an\nimpressive accuracy rate in the datasets UIEB, SUIM and RUIE, well-established\nunderwater image benchmarks. Additionally, the experiments validate the\nrobustness of our approach, regarding the image quality metrics PSNR, SSIM,\nUIQM, and UISM, indicating the good performance of the brightness enhancement\nprocess. The source code is available here: https://github.com/gusanagy/UDBE.\n","authors":["Tatiana Taís Schein","Gustavo Pereira de Almeira","Stephanie Loi Brião","Rodrigo Andrade de Bem","Felipe Gomes de Oliveira","Paulo L. J. Drews-Jr"],"pdf_url":"https://arxiv.org/pdf/2501.16211v1.pdf","comment":"Paper presented at ICMLA 2024"},{"id":"http://arxiv.org/abs/2501.01834v3","updated":"2025-01-27T16:34:59Z","published":"2025-01-03T14:38:01Z","title":"MoColl: Agent-Based Specific and General Model Collaboration for Image\n Captioning","summary":" Image captioning is a critical task at the intersection of computer vision\nand natural language processing, with wide-ranging applications across various\ndomains. 
For complex tasks such as diagnostic report generation, deep learning\nmodels require not only domain-specific image-caption datasets but also the\nincorporation of relevant general knowledge to provide contextual accuracy.\nExisting approaches exhibit inherent limitations: specialized models excel in\ncapturing domain-specific details but lack generalization, while\nvision-language models (VLMs) built on large language models (LLMs) leverage\ngeneral knowledge but struggle with domain-specific adaptation. To address\nthese limitations, this paper proposes a novel agent-enhanced model\ncollaboration framework, which we call MoColl, designed to effectively\nintegrate domain-specific and general knowledge. Specifically, our approach is\nto decompose complex image captioning tasks into a series of interconnected\nquestion-answer subtasks. A trainable visual question answering (VQA) model is\nemployed as a specialized tool to focus on domain-specific visual analysis,\nanswering task-specific questions based on image content. Concurrently, an\nLLM-based agent with general knowledge formulates these questions and\nsynthesizes the resulting question-answer pairs into coherent captions. Beyond\nits role in leveraging the VQA model, the agent further guides its training to\nenhance its domain-specific capabilities. Experimental results on radiology\nreport generation validate the effectiveness of the proposed framework,\ndemonstrating significant improvements in the quality of generated reports.\n","authors":["Pu Yang","Bin Dong"],"pdf_url":"https://arxiv.org/pdf/2501.01834v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16182v1","updated":"2025-01-27T16:29:17Z","published":"2025-01-27T16:29:17Z","title":"The Linear Attention Resurrection in Vision Transformer","summary":" Vision Transformers (ViTs) have recently taken computer vision by storm.\nHowever, the softmax attention underlying ViTs comes with a quadratic\ncomplexity in time and memory, hindering the application of ViTs to\nhigh-resolution images. We revisit the attention design and propose a linear\nattention method to address the limitation, which doesn't sacrifice ViT's core\nadvantage of capturing global representation like existing methods (e.g. local\nwindow attention of Swin). We further investigate the key difference between\nlinear attention and softmax attention. Our empirical results suggest that\nlinear attention lacks a fundamental property of concentrating the distribution\nof the attention matrix. Inspired by this observation, we introduce a local\nconcentration module to enhance linear attention. By incorporating enhanced\nlinear global attention and local window attention, we propose a new ViT\narchitecture, dubbed L$^2$ViT. Notably, L$^2$ViT can effectively capture both\nglobal interactions and local representations while enjoying linear\ncomputational complexity. Extensive experiments demonstrate the strong\nperformance of L$^2$ViT. On image classification, L$^2$ViT achieves 84.4% Top-1\naccuracy on ImageNet-1K without any extra training data or label. By further\npre-training on ImageNet-22k, it attains 87.0% when fine-tuned with resolution\n384$^2$. 
For downstream tasks, L$^2$ViT delivers favorable performance as a\nbackbone on object detection as well as semantic segmentation.\n","authors":["Chuanyang Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.16182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09555v2","updated":"2025-01-27T16:28:21Z","published":"2025-01-16T14:18:06Z","title":"Text-driven Adaptation of Foundation Models for Few-shot Surgical\n Workflow Analysis","summary":" Purpose: Surgical workflow analysis is crucial for improving surgical\nefficiency and safety. However, previous studies rely heavily on large-scale\nannotated datasets, posing challenges in cost, scalability, and reliance on\nexpert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven\nAdaptation), designed to handle various surgical workflow analysis tasks with\nminimal paired image-label data.\n Methods: Our approach has two key components. First, Few-shot selection-based\nmodality alignment selects a small subset of images and aligns their embeddings\nwith text embeddings from the downstream task, bridging the modality gap.\nSecond, Text-driven adaptation leverages only text data to train a decoder,\neliminating the need for paired image-text data. This decoder is then applied\nto aligned image embeddings, enabling image-related tasks without explicit\nimage-text pairs.\n Results: We evaluate our approach to generative tasks (image captioning) and\ndiscriminative tasks (triplet recognition and phase recognition). Results show\nthat Surg-FTDA outperforms baselines and generalizes well across downstream\ntasks.\n Conclusion: We propose a text-driven adaptation approach that mitigates the\nmodality gap and handles multiple downstream tasks in surgical workflow\nanalysis, with minimal reliance on large annotated datasets. The code and\ndataset will be released in https://github.com/CAMMA-public/Surg-FTDA\n","authors":["Tingxuan Chen","Kun Yuan","Vinkle Srivastav","Nassir Navab","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2501.09555v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16177v1","updated":"2025-01-27T16:23:45Z","published":"2025-01-27T16:23:45Z","title":"BAG: Body-Aligned 3D Wearable Asset Generation","summary":" While recent advancements have shown remarkable progress in general 3D shape\ngeneration models, the challenge of leveraging these approaches to\nautomatically generate wearable 3D assets remains unexplored. To this end, we\npresent BAG, a Body-aligned Asset Generation method to output 3D wearable asset\nthat can be automatically dressed on given 3D human bodies. This is achived by\ncontrolling the 3D generation process using human body shape and pose\ninformation. Specifically, we first build a general single-image to consistent\nmultiview image diffusion model, and train it on the large Objaverse dataset to\nachieve diversity and generalizability. Then we train a Controlnet to guide the\nmultiview generator to produce body-aligned multiview images. The control\nsignal utilizes the multiview 2D projections of the target human body, where\npixel values represent the XYZ coordinates of the body surface in a canonical\nspace. The body-conditioned multiview diffusion generates body-aligned\nmultiview images, which are then fed into a native 3D diffusion model to\nproduce the 3D shape of the asset. 
Finally, by recovering the similarity\ntransformation using multiview silhouette supervision and addressing asset-body\npenetration with physics simulators, the 3D asset can be accurately fitted onto\nthe target human body. Experimental results demonstrate significant advantages\nover existing methods in terms of image prompt-following capability, shape\ndiversity, and shape quality. Our project page is available at\nhttps://bag-3d.github.io/.\n","authors":["Zhongjin Luo","Yang Li","Mingrui Zhang","Senbo Wang","Han Yan","Xibin Song","Taizhang Shang","Wei Mao","Hongdong Li","Xiaoguang Han","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2501.16177v1.pdf","comment":"video: https://youtu.be/XJtG82LjQKc"},{"id":"http://arxiv.org/abs/2501.09155v2","updated":"2025-01-27T16:05:59Z","published":"2025-01-15T21:14:36Z","title":"VCRScore: Image captioning metric based on V\\&L Transformers, CLIP, and\n precision-recall","summary":" Image captioning has become an essential Vision & Language research task. It\nis about predicting the most accurate caption given a specific image or video.\nThe research community has achieved impressive results by continuously\nproposing new models and approaches to improve the overall model's performance.\nNevertheless, despite increasing proposals, the performance metrics used to\nmeasure their advances have remained practically untouched through the years. A\nprobe of that, nowadays metrics like BLEU, METEOR, CIDEr, and ROUGE are still\nvery used, aside from more sophisticated metrics such as BertScore and\nClipScore.\n Hence, it is essential to adjust how are measure the advances, limitations,\nand scopes of the new image captioning proposals, as well as to adapt new\nmetrics to these new advanced image captioning approaches.\n This work proposes a new evaluation metric for the image captioning problem.\nTo do that, first, it was generated a human-labeled dataset to assess to which\ndegree the captions correlate with the image's content. Taking these human\nscores as ground truth, we propose a new metric, and compare it with several\nwell-known metrics, from classical to newer ones. Outperformed results were\nalso found, and interesting insights were presented and discussed.\n","authors":["Guillermo Ruiz","Tania Ramírez","Daniela Moctezuma"],"pdf_url":"https://arxiv.org/pdf/2501.09155v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2405.16343v3","updated":"2025-01-27T15:56:30Z","published":"2024-05-25T20:00:27Z","title":"Learning Point Spread Function Invertibility Assessment for Image\n Deconvolution","summary":" Deep-learning (DL)-based image deconvolution (ID) has exhibited remarkable\nrecovery performance, surpassing traditional linear methods. However, unlike\ntraditional ID approaches that rely on analytical properties of the point\nspread function (PSF) to achieve high recovery performance - such as specific\nspectrum properties or small conditional numbers in the convolution matrix - DL\ntechniques lack quantifiable metrics for evaluating PSF suitability for\nDL-assisted recovery. Aiming to enhance deconvolution quality, we propose a\nmetric that employs a non-linear approach to learn the invertibility of an\narbitrary PSF using a neural network by mapping it to a unit impulse. A lower\ndiscrepancy between the mapped PSF and a unit impulse indicates a higher\nlikelihood of successful inversion by a DL network. 
Our findings reveal that\nthis metric correlates with high recovery performance in DL and traditional\nmethods, thereby serving as an effective regularizer in deconvolution tasks.\nThis approach reduces the computational complexity over conventional condition\nnumber assessments and is a differentiable process. These useful properties\nallow its application in designing diffractive optical elements through\nend-to-end (E2E) optimization, achieving invertible PSFs, and outperforming the\nE2E baseline framework.\n","authors":["Romario Gualdrón-Hurtado","Roman Jacome","Sergio Urrea","Henry Arguello","Luis Gonzalez"],"pdf_url":"https://arxiv.org/pdf/2405.16343v3.pdf","comment":"Accepted at the 2024 32nd European Signal Processing Conference\n (EUSIPCO), 2024"},{"id":"http://arxiv.org/abs/2501.16147v1","updated":"2025-01-27T15:41:19Z","published":"2025-01-27T15:41:19Z","title":"Efficient Portrait Matte Creation With Layer Diffusion and Connectivity\n Priors","summary":" Learning effective deep portrait matting models requires training data of\nboth high quality and large quantity. Neither quality nor quantity can be\neasily met for portrait matting, however. Since the most accurate ground-truth\nportrait mattes are acquired in front of the green screen, it is almost\nimpossible to harvest a large-scale portrait matting dataset in reality. This\nwork shows that one can leverage text prompts and the recent Layer Diffusion\nmodel to generate high-quality portrait foregrounds and extract latent portrait\nmattes. However, the portrait mattes cannot be readily in use due to\nsignificant generation artifacts. Inspired by the connectivity priors observed\nin portrait images, that is, the border of portrait foregrounds always appears\nconnected, a connectivity-aware approach is introduced to refine portrait\nmattes. Building on this, a large-scale portrait matting dataset is created,\ntermed LD-Portrait-20K, with $20,051$ portrait foregrounds and high-quality\nalpha mattes. Extensive experiments demonstrated the value of the\nLD-Portrait-20K dataset, with models trained on it significantly outperforming\nthose trained on other datasets. In addition, comparisons with the chroma\nkeying algorithm and an ablation study on dataset capacity further confirmed\nthe effectiveness of the proposed matte creation approach. Further, the dataset\nalso contributes to state-of-the-art video portrait matting, implemented by\nsimple video segmentation and a trimap-based image matting model trained on\nthis dataset.\n","authors":["Zhiyuan Lu","Hao Lu","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2501.16147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16146v1","updated":"2025-01-27T15:39:39Z","published":"2025-01-27T15:39:39Z","title":"Toward Efficient Generalization in 3D Human Pose Estimation via a\n Canonical Domain Approach","summary":" Recent advancements in deep learning methods have significantly improved the\nperformance of 3D Human Pose Estimation (HPE). However, performance degradation\ncaused by domain gaps between source and target domains remains a major\nchallenge to generalization, necessitating extensive data augmentation and/or\nfine-tuning for each specific target domain. To address this issue more\nefficiently, we propose a novel canonical domain approach that maps both the\nsource and target domains into a unified canonical domain, alleviating the need\nfor additional fine-tuning in the target domain. 
To construct the canonical\ndomain, we introduce a canonicalization process to generate a novel canonical\n2D-3D pose mapping that ensures 2D-3D pose consistency and simplifies 2D-3D\npose patterns, enabling more efficient training of lifting networks. The\ncanonicalization of both domains is achieved through the following steps: (1)\nin the source domain, the lifting network is trained within the canonical\ndomain; (2) in the target domain, input 2D poses are canonicalized prior to\ninference by leveraging the properties of perspective projection and known\ncamera intrinsics. Consequently, the trained network can be directly applied to\nthe target domain without requiring additional fine-tuning. Experiments\nconducted with various lifting networks and publicly available datasets (e.g.,\nHuman3.6M, Fit3D, MPI-INF-3DHP) demonstrate that the proposed method\nsubstantially improves generalization capability across datasets while using\nthe same data volume.\n","authors":["Hoosang Lee","Jeha Ryu"],"pdf_url":"https://arxiv.org/pdf/2501.16146v1.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.13975v2","updated":"2025-01-27T15:20:11Z","published":"2025-01-22T22:28:11Z","title":"3DGS$^2$: Near Second-order Converging 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has emerged as a mainstream solution for novel\nview synthesis and 3D reconstruction. By explicitly encoding a 3D scene using a\ncollection of Gaussian kernels, 3DGS achieves high-quality rendering with\nsuperior efficiency. As a learning-based approach, 3DGS training has been dealt\nwith the standard stochastic gradient descent (SGD) method, which offers at\nmost linear convergence. Consequently, training often requires tens of minutes,\neven with GPU acceleration. This paper introduces a (near) second-order\nconvergent training algorithm for 3DGS, leveraging its unique properties. Our\napproach is inspired by two key observations. First, the attributes of a\nGaussian kernel contribute independently to the image-space loss, which\nendorses isolated and local optimization algorithms. We exploit this by\nsplitting the optimization at the level of individual kernel attributes,\nanalytically constructing small-size Newton systems for each parameter group,\nand efficiently solving these systems on GPU threads. This achieves Newton-like\nconvergence per training image without relying on the global Hessian. Second,\nkernels exhibit sparse and structured coupling across input images. This\nproperty allows us to effectively utilize spatial information to mitigate\novershoot during stochastic training. Our method converges an order faster than\nstandard GPU-based 3DGS training, requiring over $10\\times$ fewer iterations\nwhile maintaining or surpassing the quality of the compared with the SGD-based\n3DGS reconstructions.\n","authors":["Lei Lan","Tianjia Shao","Zixuan Lu","Yu Zhang","Chenfanfu Jiang","Yin Yang"],"pdf_url":"https://arxiv.org/pdf/2501.13975v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2501.16101v1","updated":"2025-01-27T14:50:19Z","published":"2025-01-27T14:50:19Z","title":"3D Reconstruction of non-visible surfaces of objects from a Single Depth\n View -- Comparative Study","summary":" Scene and object reconstruction is an important problem in robotics, in\nparticular in planning collision-free trajectories or in object manipulation.\nThis paper compares two strategies for the reconstruction of nonvisible parts\nof the object surface from a single RGB-D camera view. 
The first method, named\nDeepSDF predicts the Signed Distance Transform to the object surface for a\ngiven point in 3D space. The second method, named MirrorNet reconstructs the\noccluded objects' parts by generating images from the other side of the\nobserved object. Experiments performed with objects from the ShapeNet dataset,\nshow that the view-dependent MirrorNet is faster and has smaller reconstruction\nerrors in most categories.\n","authors":["Rafał Staszak","Piotr Michałek","Jakub Chudziński","Marek Kopicki","Dominik Belter"],"pdf_url":"https://arxiv.org/pdf/2501.16101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16100v1","updated":"2025-01-27T14:50:13Z","published":"2025-01-27T14:50:13Z","title":"Automated Detection of Sport Highlights from Audio and Video Sources","summary":" This study presents a novel Deep Learning-based and lightweight approach for\nthe automated detection of sports highlights (HLs) from audio and video\nsources. HL detection is a key task in sports video analysis, traditionally\nrequiring significant human effort. Our solution leverages Deep Learning (DL)\nmodels trained on relatively small datasets of audio Mel-spectrograms and\ngrayscale video frames, achieving promising accuracy rates of 89% and 83% for\naudio and video detection, respectively. The use of small datasets, combined\nwith simple architectures, demonstrates the practicality of our method for fast\nand cost-effective deployment. Furthermore, an ensemble model combining both\nmodalities shows improved robustness against false positives and false\nnegatives. The proposed methodology offers a scalable solution for automated HL\ndetection across various types of sports video content, reducing the need for\nmanual intervention. Future work will focus on enhancing model architectures\nand extending this approach to broader scene-detection tasks in media analysis.\n","authors":["Francesco Della Santa","Morgana Lalli"],"pdf_url":"https://arxiv.org/pdf/2501.16100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03800v3","updated":"2025-01-27T14:44:33Z","published":"2025-01-07T14:06:57Z","title":"MADation: Face Morphing Attack Detection with Foundation Models","summary":" Despite the considerable performance improvements of face recognition\nalgorithms in recent years, the same scientific advances responsible for this\nprogress can also be used to create efficient ways to attack them, posing a\nthreat to their secure deployment. Morphing attack detection (MAD) systems aim\nto detect a specific type of threat, morphing attacks, at an early stage,\npreventing them from being considered for verification in critical processes.\nFoundation models (FM) learn from extensive amounts of unlabelled data,\nachieving remarkable zero-shot generalization to unseen domains. Although this\ngeneralization capacity might be weak when dealing with domain-specific\ndownstream tasks such as MAD, FMs can easily adapt to these settings while\nretaining the built-in knowledge acquired during pre-training. In this work, we\nrecognize the potential of FMs to perform well in the MAD task when properly\nadapted to its specificities. To this end, we adapt FM CLIP architectures with\nLoRA weights while simultaneously training a classification header. The\nproposed framework, MADation surpasses our alternative FM and transformer-based\nframeworks and constitutes the first adaption of FMs to the MAD task. 
MADation\npresents competitive results with current MAD solutions in the literature and\neven surpasses them in several evaluation scenarios. To encourage\nreproducibility and facilitate further research in MAD, we publicly release the\nimplementation of MADation at https://github.com/gurayozgur/MADation\n","authors":["Eduarda Caldeira","Guray Ozgur","Tahar Chettaoui","Marija Ivanovska","Peter Peer","Fadi Boutros","Vitomir Struc","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2501.03800v3.pdf","comment":"Accepted at WACV 2025 workshops"},{"id":"http://arxiv.org/abs/2501.16085v1","updated":"2025-01-27T14:33:27Z","published":"2025-01-27T14:33:27Z","title":"ARFlow: Autogressive Flow with Hybrid Linear Attention","summary":" Flow models are effective at progressively generating realistic images, but\nthey generally struggle to capture long-range dependencies during the\ngeneration process as they compress all the information from previous time\nsteps into a single corrupted image. To address this limitation, we propose\nintegrating autoregressive modeling -- known for its excellence in modeling\ncomplex, high-dimensional joint probability distributions -- into flow models.\nDuring training, at each step, we construct causally-ordered sequences by\nsampling multiple images from the same semantic category and applying different\nlevels of noise, where images with higher noise levels serve as causal\npredecessors to those with lower noise levels. This design enables the model to\nlearn broader category-level variations while maintaining proper causal\nrelationships in the flow process. During generation, the model\nautoregressively conditions the previously generated images from earlier\ndenoising steps, forming a contextual and coherent generation trajectory.\nAdditionally, we design a customized hybrid linear attention mechanism tailored\nto our modeling approach to enhance computational efficiency. Our approach,\ntermed ARFlow, under 400k training steps, achieves 14.08 FID scores on ImageNet\nat 128 * 128 without classifier-free guidance, reaching 4.34 FID with\nclassifier-free guidance 1.5, significantly outperforming the previous\nflow-based model SiT's 9.17 FID. Extensive ablation studies demonstrate the\neffectiveness of our modeling strategy and chunk-wise attention design.\n","authors":["Mude Hui","Rui-Jie Zhu","Songlin Yang","Yu Zhang","Zirui Wang","Yuyin Zhou","Jason Eshraghian","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2501.16085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16065v1","updated":"2025-01-27T14:08:25Z","published":"2025-01-27T14:08:25Z","title":"CILP-FGDI: Exploiting Vision-Language Model for Generalizable Person\n Re-Identification","summary":" The Visual Language Model, known for its robust cross-modal capabilities, has\nbeen extensively applied in various computer vision tasks. In this paper, we\nexplore the use of CLIP (Contrastive Language-Image Pretraining), a\nvision-language model pretrained on large-scale image-text pairs to align\nvisual and textual features, for acquiring fine-grained and domain-invariant\nrepresentations in generalizable person re-identification. The adaptation of\nCLIP to the task presents two primary challenges: learning more fine-grained\nfeatures to enhance discriminative ability, and learning more domain-invariant\nfeatures to improve the model's generalization capabilities. 
To mitigate the\nfirst challenge thereby enhance the ability to learn fine-grained features, a\nthree-stage strategy is proposed to boost the accuracy of text descriptions.\nInitially, the image encoder is trained to effectively adapt to person\nre-identification tasks. In the second stage, the features extracted by the\nimage encoder are used to generate textual descriptions (i.e., prompts) for\neach image. Finally, the text encoder with the learned prompts is employed to\nguide the training of the final image encoder. To enhance the model's\ngeneralization capabilities to unseen domains, a bidirectional guiding method\nis introduced to learn domain-invariant image features. Specifically,\ndomain-invariant and domain-relevant prompts are generated, and both positive\n(pulling together image features and domain-invariant prompts) and negative\n(pushing apart image features and domain-relevant prompts) views are used to\ntrain the image encoder. Collectively, these strategies contribute to the\ndevelopment of an innovative CLIP-based framework for learning fine-grained\ngeneralized features in person re-identification.\n","authors":["Huazhong Zhao","Lei Qi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2501.16065v1.pdf","comment":"Accepted by IEEE TIFS"},{"id":"http://arxiv.org/abs/2410.10791v2","updated":"2025-01-27T13:45:16Z","published":"2024-10-14T17:56:20Z","title":"CAFuser: Condition-Aware Multimodal Fusion for Robust Semantic\n Perception of Driving Scenes","summary":" Leveraging multiple sensors is crucial for robust semantic perception in\nautonomous driving, as each sensor type has complementary strengths and\nweaknesses. However, existing sensor fusion methods often treat sensors\nuniformly across all conditions, leading to suboptimal performance. By\ncontrast, we propose a novel, condition-aware multimodal fusion approach for\nrobust semantic perception of driving scenes. Our method, CAFuser, uses an RGB\ncamera input to classify environmental conditions and generate a Condition\nToken that guides the fusion of multiple sensor modalities. We further newly\nintroduce modality-specific feature adapters to align diverse sensor inputs\ninto a shared latent space, enabling efficient integration with a single and\nshared pre-trained backbone. By dynamically adapting sensor fusion based on the\nactual condition, our model significantly improves robustness and accuracy,\nespecially in adverse-condition scenarios. CAFuser ranks first on the public\nMUSES benchmarks, achieving 59.7 PQ for multimodal panoptic and 78.2 mIoU for\nsemantic segmentation, and also sets the new state of the art on DeLiVER. The\nsource code is publicly available at: https://github.com/timbroed/CAFuser.\n","authors":["Tim Broedermann","Christos Sakaridis","Yuqian Fu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2410.10791v2.pdf","comment":"IEEE Robotics and Automation Letters, The source code is publicly\n available at: https://github.com/timbroed/CAFuser"},{"id":"http://arxiv.org/abs/2501.16037v1","updated":"2025-01-27T13:32:01Z","published":"2025-01-27T13:32:01Z","title":"Addressing Out-of-Label Hazard Detection in Dashcam Videos: Insights\n from the COOOL Challenge","summary":" This paper presents a novel approach for hazard analysis in dashcam footage,\naddressing the detection of driver reactions to hazards, the identification of\nhazardous objects, and the generation of descriptive captions. 
We first\nintroduce a method for detecting driver reactions through speed and sound\nanomaly detection, leveraging unsupervised learning techniques. For hazard\ndetection, we employ a set of heuristic rules as weak classifiers, which are\ncombined using an ensemble method. This ensemble approach is further refined\nwith differential privacy to mitigate overconfidence, ensuring robustness\ndespite the lack of labeled data. Lastly, we use state-of-the-art\nvision-language models for hazard captioning, generating descriptive labels for\nthe detected hazards. Our method achieved the highest scores in the Challenge\non Out-of-Label in Autonomous Driving, demonstrating its effectiveness across\nall three tasks. Source codes are publicly available at\nhttps://github.com/ffyyytt/COOOL_2025.\n","authors":["Anh-Kiet Duong","Petra Gomez-Krämer"],"pdf_url":"https://arxiv.org/pdf/2501.16037v1.pdf","comment":"5 pages, WACV 2025"},{"id":"http://arxiv.org/abs/2407.09372v2","updated":"2025-01-27T13:30:20Z","published":"2024-07-12T15:53:15Z","title":"Segmentation Dataset for Reinforced Concrete Construction","summary":" This paper provides a dataset of 14,805 RGB images with segmentation labels\nfor autonomous robotic inspection of reinforced concrete defects. Baselines for\nthe YOLOv8L-seg, DeepLabV3, and U-Net segmentation models are established.\nLabelling inconsistencies are addressed statistically, and their influence on\nmodel performance is analyzed. An error identification tool is employed to\nexamine the error modes of the models. The paper demonstrates that YOLOv8L-seg\nperforms best, achieving a validation mIOU score of up to 0.59. Label\ninconsistencies were found to have a negligible effect on model performance,\nwhile the inclusion of more data improved the performance. False negatives were\nidentified as the primary failure mode. The results highlight the importance of\ndata availability for the performance of deep learning-based models. The lack\nof publicly available data is identified as a significant contributor to false\nnegatives. To address this, the paper advocates for an increased open-source\napproach within the construction community.\n","authors":["Patrick Schmidt","Lazaros Nalpantidis"],"pdf_url":"https://arxiv.org/pdf/2407.09372v2.pdf","comment":"The ConRebSeg Dataset can be found under the following DOI:\n https://doi.org/10.11583/DTU.26213762 Corresponding code to download\n additional data and initialize the dataset under\n https://github.com/DTU-PAS/ConRebSeg This work is an accepted manuscript up\n for publication in the Elsevier journal \"Automation in Construction\""},{"id":"http://arxiv.org/abs/2412.12709v3","updated":"2025-01-27T13:29:05Z","published":"2024-12-17T09:23:46Z","title":"Accelerating lensed quasar discovery and modeling with physics-informed\n variational autoencoders","summary":" Strongly lensed quasars provide valuable insights into the rate of cosmic\nexpansion, the distribution of dark matter in foreground deflectors, and the\ncharacteristics of quasar hosts. However, detecting them in astronomical images\nis difficult due to the prevalence of non-lensing objects. To address this\nchallenge, we developed a generative deep learning model called VariLens, built\nupon a physics-informed variational autoencoder. This model seamlessly\nintegrates three essential modules: image reconstruction, object\nclassification, and lens modeling, offering a fast and comprehensive approach\nto strong lens analysis. 
VariLens is capable of rapidly determining both (1)\nthe probability that an object is a lens system and (2) key parameters of a\nsingular isothermal ellipsoid (SIE) mass model -- including the Einstein radius\n($\\theta_\\mathrm{E}$), lens center, and ellipticity -- in just milliseconds\nusing a single CPU. A direct comparison of VariLens estimates with traditional\nlens modeling for 20 known lensed quasars within the Subaru Hyper Suprime-Cam\n(HSC) footprint shows good agreement, with both results consistent within\n$2\\sigma$ for systems with $\\theta_\\mathrm{E}<3$ arcsecs. To identify new\nlensed quasar candidates, we begin with an initial sample of approximately 80\nmillion sources, combining HSC data with multiwavelength information from\nvarious surveys. After applying a photometric preselection aimed at locating\n$z>1.5$ sources, the number of candidates was reduced to 710,966. Subsequently,\nVariLens highlights 13,831 sources, each showing a high likelihood of being a\nlens. A visual assessment of these objects results in 42 promising candidates\nthat await spectroscopic confirmation. These results underscore the potential\nof automated deep learning pipelines to efficiently detect and model strong\nlenses in large datasets.\n","authors":["Irham T. Andika","Stefan Schuldt","Sherry H. Suyu","Satadru Bag","Raoul Cañameras","Alejandra Melo","Claudio Grillo","James H. H. Chan"],"pdf_url":"https://arxiv.org/pdf/2412.12709v3.pdf","comment":"Accepted for publication in the Astronomy & Astrophysics journal and\n updated to reflect the revised version. The paper consists of 15 main pages,\n 12 figures, and 1 table. We welcome feedback and comments from readers!"},{"id":"http://arxiv.org/abs/2406.19087v2","updated":"2025-01-27T13:19:27Z","published":"2024-06-27T11:14:14Z","title":"Dimensions underlying the representational alignment of deep neural\n networks with humans","summary":" Determining the similarities and differences between humans and artificial\nintelligence (AI) is an important goal both in computational cognitive\nneuroscience and machine learning, promising a deeper understanding of human\ncognition and safer, more reliable AI systems. Much previous work comparing\nrepresentations in humans and AI has relied on global, scalar measures to\nquantify their alignment. However, without explicit hypotheses, these measures\nonly inform us about the degree of alignment, not the factors that determine\nit. To address this challenge, we propose a generic framework to compare human\nand AI representations, based on identifying latent representational dimensions\nunderlying the same behavior in both domains. Applying this framework to humans\nand a deep neural network (DNN) model of natural images revealed a\nlow-dimensional DNN embedding of both visual and semantic dimensions. In\ncontrast to humans, DNNs exhibited a clear dominance of visual over semantic\nproperties, indicating divergent strategies for representing images. While\nin-silico experiments showed seemingly consistent interpretability of DNN\ndimensions, a direct comparison between human and DNN representations revealed\nsubstantial differences in how they process images. By making representations\ndirectly comparable, our results reveal important challenges for\nrepresentational alignment and offer a means for improving their comparability.\n","authors":["Florian P. Mahner","Lukas Muttenthaler","Umut Güçlü","Martin N. 
Hebart"],"pdf_url":"https://arxiv.org/pdf/2406.19087v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16022v1","updated":"2025-01-27T13:07:51Z","published":"2025-01-27T13:07:51Z","title":"Freestyle Sketch-in-the-Loop Image Segmentation","summary":" In this paper, we expand the domain of sketch research into the field of\nimage segmentation, aiming to establish freehand sketches as a query modality\nfor subjective image segmentation. Our innovative approach introduces a\n\"sketch-in-the-loop\" image segmentation framework, enabling the segmentation of\nvisual concepts partially, completely, or in groupings - a truly \"freestyle\"\napproach - without the need for a purpose-made dataset (i.e., mask-free). This\nframework capitalises on the synergy between sketch-based image retrieval\n(SBIR) models and large-scale pre-trained models (CLIP or DINOv2). The former\nprovides an effective training signal, while fine-tuned versions of the latter\nexecute the subjective segmentation. Additionally, our purpose-made\naugmentation strategy enhances the versatility of our sketch-guided mask\ngeneration, allowing segmentation at multiple granularity levels. Extensive\nevaluations across diverse benchmark datasets underscore the superior\nperformance of our method in comparison to existing approaches across various\nevaluation scenarios.\n","authors":["Subhadeep Koley","Viswanatha Reddy Gajjala","Aneeshan Sain","Pinaki Nath Chowdhury","Tao Xiang","Ayan Kumar Bhunia","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2501.16022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16003v1","updated":"2025-01-27T12:42:20Z","published":"2025-01-27T12:42:20Z","title":"Improving Tropical Cyclone Forecasting With Video Diffusion Models","summary":" Tropical cyclone (TC) forecasting is crucial for disaster preparedness and\nmitigation. While recent deep learning approaches have shown promise, existing\nmethods often treat TC evolution as a series of independent frame-to-frame\npredictions, limiting their ability to capture long-term dynamics. We present a\nnovel application of video diffusion models for TC forecasting that explicitly\nmodels temporal dependencies through additional temporal layers. Our approach\nenables the model to generate multiple frames simultaneously, better capturing\ncyclone evolution patterns. We introduce a two-stage training strategy that\nsignificantly improves individual-frame quality and performance in low-data\nregimes. Experimental results show our method outperforms the previous approach\nof Nath et al. by 19.3% in MAE, 16.2% in PSNR, and 36.1% in SSIM. Most notably,\nwe extend the reliable forecasting horizon from 36 to 50 hours. Through\ncomprehensive evaluation using both traditional metrics and Fr\\'echet Video\nDistance (FVD), we demonstrate that our approach produces more temporally\ncoherent forecasts while maintaining competitive single-frame quality. 
Code\naccessible at https://github.com/Ren-creater/forecast-video-diffmodels.\n","authors":["Zhibo Ren","Pritthijit Nath","Pancham Shukla"],"pdf_url":"https://arxiv.org/pdf/2501.16003v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.04636v2","updated":"2025-01-27T12:35:24Z","published":"2024-10-06T21:51:02Z","title":"Multi-Tiered Self-Contrastive Learning for Medical Microwave Radiometry\n (MWR) Breast Cancer Detection","summary":" Improving breast cancer detection and monitoring techniques is a critical\nobjective in healthcare, driving the need for innovative imaging technologies\nand diagnostic approaches. This study introduces a novel multi-tiered\nself-contrastive model tailored for microwave radiometry (MWR) in breast cancer\ndetection. Our approach incorporates three distinct models: Local-MWR (L-MWR),\nRegional-MWR (R-MWR), and Global-MWR (G-MWR), designed to analyze varying\nsub-regional comparisons within the breasts. These models are integrated\nthrough the Joint-MWR (J-MWR) network, which leverages self-contrastive results\nat each analytical level to improve diagnostic accuracy. Utilizing a dataset of\n4,932 female patients, our research demonstrates the efficacy of our proposed\nmodels. Notably, the J-MWR model achieves a Matthew's correlation coefficient\nof 0.74 $\\pm$ 0.018, surpassing existing MWR neural networks and contrastive\nmethods. These findings highlight the potential of self-contrastive learning\ntechniques in improving the diagnostic accuracy and generalizability for\nMWR-based breast cancer detection. This advancement holds considerable promise\nfor future investigations into enabling point-of-care testing. The source code\nis available at: https://github.com/cgalaz01/self_contrastive_mwr.\n","authors":["Christoforos Galazis","Huiyi Wu","Igor Goryanin"],"pdf_url":"https://arxiv.org/pdf/2410.04636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15998v1","updated":"2025-01-27T12:31:50Z","published":"2025-01-27T12:31:50Z","title":"Controllable Forgetting Mechanism for Few-Shot Class-Incremental\n Learning","summary":" Class-incremental learning in the context of limited personal labeled samples\n(few-shot) is critical for numerous real-world applications, such as smart home\ndevices. A key challenge in these scenarios is balancing the trade-off between\nadapting to new, personalized classes and maintaining the performance of the\nmodel on the original, base classes. Fine-tuning the model on novel classes\noften leads to the phenomenon of catastrophic forgetting, where the accuracy of\nbase classes declines unpredictably and significantly. In this paper, we\npropose a simple yet effective mechanism to address this challenge by\ncontrolling the trade-off between novel and base class accuracy. We\nspecifically target the ultra-low-shot scenario, where only a single example is\navailable per novel class. Our approach introduces a Novel Class Detection\n(NCD) rule, which adjusts the degree of forgetting a priori while\nsimultaneously enhancing performance on novel classes. We demonstrate the\nversatility of our solution by applying it to state-of-the-art Few-Shot\nClass-Incremental Learning (FSCIL) methods, showing consistent improvements\nacross different settings. To better quantify the trade-off between novel and\nbase class performance, we introduce new metrics: NCR@2FOR and NCR@5FOR. 
Our\napproach achieves up to a 30% improvement in novel class accuracy on the\nCIFAR100 dataset (1-shot, 1 novel class) while maintaining a controlled base\nclass forgetting rate of 2%.\n","authors":["Kirill Paramonov","Mete Ozay","Eunju Yang","Jijoong Moon","Umberto Michieli"],"pdf_url":"https://arxiv.org/pdf/2501.15998v1.pdf","comment":"ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.15994v1","updated":"2025-01-27T12:29:19Z","published":"2025-01-27T12:29:19Z","title":"Real-Time Brain Tumor Detection in Intraoperative Ultrasound Using\n YOLO11: From Model Training to Deployment in the Operating Room","summary":" Intraoperative ultrasound (ioUS) is a valuable tool in brain tumor surgery\ndue to its versatility, affordability, and seamless integration into the\nsurgical workflow. However, its adoption remains limited, primarily because of\nthe challenges associated with image interpretation and the steep learning\ncurve required for effective use. This study aimed to enhance the\ninterpretability of ioUS images by developing a real-time brain tumor detection\nsystem deployable in the operating room. We collected 2D ioUS images from the\nBrain Tumor Intraoperative Database (BraTioUS) and the public ReMIND dataset,\nannotated with expert-refined tumor labels. Using the YOLO11 architecture and\nits variants, we trained object detection models to identify brain tumors. The\ndataset included 1,732 images from 192 patients, divided into training,\nvalidation, and test sets. Data augmentation expanded the training set to\n11,570 images. In the test dataset, YOLO11s achieved the best balance of\nprecision and computational efficiency, with a mAP@50 of 0.95, mAP@50-95 of\n0.65, and a processing speed of 34.16 frames per second. The proposed solution\nwas prospectively validated in a cohort of 15 consecutively operated patients\ndiagnosed with brain tumors. Neurosurgeons confirmed its seamless integration\ninto the surgical workflow, with real-time predictions accurately delineating\ntumor regions. These findings highlight the potential of real-time object\ndetection algorithms to enhance ioUS-guided brain tumor surgery, addressing key\nchallenges in interpretation and providing a foundation for future development\nof computer vision-based tools for neuro-oncological surgery.\n","authors":["Santiago Cepeda","Olga Esteban-Sinovas","Roberto Romero","Vikas Singh","Prakash Shetty","Aliasgar Moiyadi","Ilyess Zemmoura","Giuseppe Roberto Giammalva","Massimiliano Del Bene","Arianna Barbotti","Francesco DiMeco","Timothy R. West","Brian V. Nahed","Ignacio Arrese","Roberto Hornero","Rosario Sarabia"],"pdf_url":"https://arxiv.org/pdf/2501.15994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15981v1","updated":"2025-01-27T12:08:52Z","published":"2025-01-27T12:08:52Z","title":"MatCLIP: Light- and Shape-Insensitive Assignment of PBR Material Models","summary":" Assigning realistic materials to 3D models remains a significant challenge in\ncomputer graphics. We propose MatCLIP, a novel method that extracts shape- and\nlighting-insensitive descriptors of Physically Based Rendering (PBR) materials\nto assign plausible textures to 3D objects based on images, such as the output\nof Latent Diffusion Models (LDMs) or photographs. Matching PBR materials to\nstatic images is challenging because the PBR representation captures the\ndynamic appearance of materials under varying viewing angles, shapes, and\nlighting conditions. 
By extending an Alpha-CLIP-based model on material\nrenderings across diverse shapes and lighting, and encoding multiple viewing\nconditions for PBR materials, our approach generates descriptors that bridge\nthe domains of PBR representations with photographs or renderings, including\nLDM outputs. This enables consistent material assignments without requiring\nexplicit knowledge of material relationships between different parts of an\nobject. MatCLIP achieves a top-1 classification accuracy of 76.6%,\noutperforming state-of-the-art methods such as PhotoShape and MatAtlas by over\n15 percentage points on publicly available datasets. Our method can be used to\nconstruct material assignments for 3D shape datasets such as ShapeNet,\n3DCoMPaT++, and Objaverse. All code and data will be released.\n","authors":["Michael Birsak","John Femiani","Biao Zhang","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2501.15981v1.pdf","comment":"Preprint, 10 pages"},{"id":"http://arxiv.org/abs/2501.03495v2","updated":"2025-01-27T11:22:55Z","published":"2025-01-07T03:33:22Z","title":"Textualize Visual Prompt for Image Editing via Diffusion Bridge","summary":" Visual prompt, a pair of before-and-after edited images, can convey\nindescribable imagery transformations and prosper in image editing. However,\ncurrent visual prompt methods rely on a pretrained text-guided image-to-image\ngenerative model that requires a triplet of text, before, and after images for\nretraining over a text-to-image model. Such crafting triplets and retraining\nprocesses limit the scalability and generalization of editing. In this paper,\nwe present a framework based on any single text-to-image model without reliance\non the explicit image-to-image model thus enhancing the generalizability and\nscalability. Specifically, by leveraging the probability-flow ordinary\nequation, we construct a diffusion bridge to transfer the distribution between\nbefore-and-after images under the text guidance. By optimizing the text via the\nbridge, the framework adaptively textualizes the editing transformation\nconveyed by visual prompts into text embeddings without other models.\nMeanwhile, we introduce differential attention control during text\noptimization, which disentangles the text embedding from the invariance of the\nbefore-and-after images and makes it solely capture the delicate transformation\nand generalize to edit various images. Experiments on real images validate\ncompetitive results on the generalization, contextual coherence, and high\nfidelity for delicate editing with just one image pair as the visual prompt.\n","authors":["Pengcheng Xu","Qingnan Fan","Fei Kou","Shuai Qin","Hong Gu","Ruoyu Zhao","Charles Ling","Boyu Wang"],"pdf_url":"https://arxiv.org/pdf/2501.03495v2.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2501.15963v1","updated":"2025-01-27T11:14:04Z","published":"2025-01-27T11:14:04Z","title":"Evaluating Data Influence in Meta Learning","summary":" As one of the most fundamental models, meta learning aims to effectively\naddress few-shot learning challenges. However, it still faces significant\nissues related to the training data, such as training inefficiencies due to\nnumerous low-contribution tasks in large datasets and substantial noise from\nincorrect labels. Thus, training data attribution methods are needed for meta\nlearning. 
However, the dual-layer structure of meta learning complicates the\nmodeling of training data contributions because of the interdependent influence\nbetween meta-parameters and task-specific parameters, making existing data\ninfluence evaluation tools inapplicable or inaccurate. To address these\nchallenges, based on the influence function, we propose a general data\nattribution evaluation framework for meta-learning within the bilevel\noptimization framework. Our approach introduces task influence functions\n(task-IF) and instance influence functions (instance-IF) to accurately assess\nthe impact of specific tasks and individual data points in closed forms. This\nframework comprehensively models data contributions across both the inner and\nouter training processes, capturing the direct effects of data points on\nmeta-parameters as well as their indirect influence through task-specific\nparameters. We also provide several strategies to enhance computational\nefficiency and scalability. Experimental results demonstrate the framework's\neffectiveness in training data evaluation via several downstream tasks.\n","authors":["Chenyang Ren","Huanyi Xie","Shu Yang","Meng Ding","Lijie Hu","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2501.15963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15955v1","updated":"2025-01-27T11:00:19Z","published":"2025-01-27T11:00:19Z","title":"Rethinking the Bias of Foundation Model under Long-tailed Distribution","summary":" Long-tailed learning has garnered increasing attention due to its practical\nsignificance. Among the various approaches, the fine-tuning paradigm has gained\nconsiderable interest with the advent of foundation models. However, most\nexisting methods primarily focus on leveraging knowledge from these models,\noverlooking the inherent biases introduced by the imbalanced training data they\nrely on. In this paper, we examine how such imbalances from pre-training affect\nlong-tailed downstream tasks. Specifically, we identify the imbalance biases\ninherited by foundation models on downstream tasks as parameter imbalance and\ndata imbalance. During fine-tuning, we observe that parameter imbalance plays a\nmore critical role, while data imbalance can be mitigated using existing\nre-balancing strategies. Moreover, we find that parameter imbalance cannot be\neffectively addressed by current re-balancing techniques, such as adjusting the\nlogits, during training, unlike data imbalance. To tackle both imbalances\nsimultaneously, we build our method on causal learning and view the incomplete\nsemantic factor as the confounder, which brings spurious correlations between\ninput samples and labels. To resolve the negative effects of this, we propose a\nnovel backdoor adjustment method that learns the true causal effect between\ninput samples and labels, rather than merely fitting the correlations in the\ndata. Notably, we achieve an average performance increase of about $1.67\\%$ on\neach dataset.\n","authors":["Jiahao Chen","Bin Qin","Jiangmeng Li","Hao Chen","Bing Su"],"pdf_url":"https://arxiv.org/pdf/2501.15955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15953v1","updated":"2025-01-27T10:57:24Z","published":"2025-01-27T10:57:24Z","title":"Understanding Long Videos via LLM-Powered Entity Relation Graphs","summary":" The analysis of extended video content poses unique challenges in artificial\nintelligence, particularly when dealing with the complexity of tracking and\nunderstanding visual elements across time. 
Current methodologies that process\nvideo frames sequentially struggle to maintain coherent tracking of objects,\nespecially when these objects temporarily vanish and later reappear in the\nfootage. A critical limitation of these approaches is their inability to\neffectively identify crucial moments in the video, largely due to their limited\ngrasp of temporal relationships. To overcome these obstacles, we present\nGraphVideoAgent, a cutting-edge system that leverages the power of graph-based\nobject tracking in conjunction with large language model capabilities. At its\ncore, our framework employs a dynamic graph structure that maps and monitors\nthe evolving relationships between visual entities throughout the video\nsequence. This innovative approach enables more nuanced understanding of how\nobjects interact and transform over time, facilitating improved frame selection\nthrough comprehensive contextual awareness. Our approach demonstrates\nremarkable effectiveness when tested against industry benchmarks. In\nevaluations on the EgoSchema dataset, GraphVideoAgent achieved a 2.2\nimprovement over existing methods while requiring analysis of only 8.2 frames\non average. Similarly, testing on the NExT-QA benchmark yielded a 2.0\nperformance increase with an average frame requirement of 8.1. These results\nunderscore the efficiency of our graph-guided methodology in enhancing both\naccuracy and computational performance in long-form video understanding tasks.\n","authors":["Meng Chu","Yicong Li","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2501.15953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12791v2","updated":"2025-01-27T10:40:20Z","published":"2024-12-17T10:52:50Z","title":"Implicit Location-Caption Alignment via Complementary Masking for\n Weakly-Supervised Dense Video Captioning","summary":" Weakly-Supervised Dense Video Captioning (WSDVC) aims to localize and\ndescribe all events of interest in a video without requiring annotations of\nevent boundaries. This setting poses a great challenge in accurately locating\nthe temporal location of event, as the relevant supervision is unavailable.\nExisting methods rely on explicit alignment constraints between event locations\nand captions, which involve complex event proposal procedures during both\ntraining and inference. To tackle this problem, we propose a novel implicit\nlocation-caption alignment paradigm by complementary masking, which simplifies\nthe complex event proposal and localization process while maintaining\neffectiveness. Specifically, our model comprises two components: a dual-mode\nvideo captioning module and a mask generation module. The dual-mode video\ncaptioning module captures global event information and generates descriptive\ncaptions, while the mask generation module generates differentiable positive\nand negative masks for localizing the events. These masks enable the implicit\nalignment of event locations and captions by ensuring that captions generated\nfrom positively and negatively masked videos are complementary, thereby forming\na complete video description. In this way, even under weak supervision, the\nevent location and event caption can be aligned implicitly. 
Extensive\nexperiments on the public datasets demonstrate that our method outperforms\nexisting weakly-supervised methods and achieves competitive results compared to\nfully-supervised methods.\n","authors":["Shiping Ge","Qiang Chen","Zhiwei Jiang","Yafeng Yin","Liu Qin","Ziyao Chen","Qing Gu"],"pdf_url":"https://arxiv.org/pdf/2412.12791v2.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.08083v2","updated":"2025-01-27T10:33:21Z","published":"2025-01-14T12:51:34Z","title":"Benchmarking Vision Foundation Models for Input Monitoring in Autonomous\n Driving","summary":" Deep neural networks (DNNs) remain challenged by distribution shifts in\ncomplex open-world domains like automated driving (AD): Absolute robustness\nagainst yet unknown novel objects (semantic shift) or styles like lighting\nconditions (covariate shift) cannot be guaranteed. Hence, reliable\noperation-time monitors for identification of out-of-training-data-distribution\n(OOD) scenarios are imperative. Current approaches for OOD classification are\nuntested for complex domains like AD, are limited in the kinds of shifts they\ndetect, or even require supervision with OOD samples. To prepare for\nunanticipated shifts, we instead establish a framework around a principled,\nunsupervised, and model-agnostic method that unifies detection of all kinds of\nshifts: Find a full model of the training data's feature distribution, to then\nuse its density at new points as in-distribution (ID) score. To implement this,\nwe propose to combine the newly available Vision Foundation Models (VFM) as\nfeature extractors with one of four alternative density modeling techniques. In\nan extensive benchmark of 4 VFMs against 20 baselines, we show the superior\nperformance of VFM feature encodings compared to shift-specific OOD monitors.\nAdditionally, we find that sophisticated architectures outperform larger latent\nspace dimensionality; and our method identifies samples with higher risk of\nerrors on downstream tasks, despite being model-agnostic. This suggests that\nVFMs are promising to realize model-agnostic, unsupervised, reliable safety\nmonitors in complex vision tasks.\n","authors":["Mert Keser","Halil Ibrahim Orhan","Niki Amini-Naieni","Gesina Schwalbe","Alois Knoll","Matthias Rottmann"],"pdf_url":"https://arxiv.org/pdf/2501.08083v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15891v1","updated":"2025-01-27T09:33:23Z","published":"2025-01-27T09:33:23Z","title":"Any2AnyTryon: Leveraging Adaptive Position Embeddings for Versatile\n Virtual Clothing Tasks","summary":" Image-based virtual try-on (VTON) aims to generate a virtual try-on result by\ntransferring an input garment onto a target person's image. However, the\nscarcity of paired garment-model data makes it challenging for existing methods\nto achieve high generalization and quality in VTON. Also, it limits the ability\nto generate mask-free try-ons. To tackle the data scarcity problem, approaches\nsuch as Stable Garment and MMTryon use a synthetic data strategy, effectively\nincreasing the amount of paired data on the model side. However, existing\nmethods are typically limited to performing specific try-on tasks and lack\nuser-friendliness. To enhance the generalization and controllability of VTON\ngeneration, we propose Any2AnyTryon, which can generate try-on results based on\ndifferent textual instructions and model garment images to meet various needs,\neliminating the reliance on masks, poses, or other conditions. 
Specifically, we\nfirst construct the virtual try-on dataset LAION-Garment, the largest known\nopen-source garment try-on dataset. Then, we introduce adaptive position\nembedding, which enables the model to generate satisfactory outfitted model\nimages or garment images based on input images of different sizes and\ncategories, significantly enhancing the generalization and controllability of\nVTON generation. In our experiments, we demonstrate the effectiveness of our\nAny2AnyTryon and compare it with existing methods. The results show that\nAny2AnyTryon enables flexible, controllable, and high-quality image-based\nvirtual try-on generation.https://logn-2024.github.io/Any2anyTryonProjectPage/\n","authors":["Hailong Guo","Bohan Zeng","Yiren Song","Wentao Zhang","Chuang Zhang","Jiaming Liu"],"pdf_url":"https://arxiv.org/pdf/2501.15891v1.pdf","comment":"13 pages,13 figures"},{"id":"http://arxiv.org/abs/2501.15890v1","updated":"2025-01-27T09:32:56Z","published":"2025-01-27T09:32:56Z","title":"A Data-Centric Approach: Dimensions of Visual Complexity and How to find\n Them","summary":" Understanding how humans perceive visual complexity is a key area of study in\nvisual cognition. Previous approaches to modeling visual complexity have often\nresulted in intricate, difficult-to-interpret solutions that employ numerous\nfeatures or sophisticated deep learning architectures. While these complex\nmodels achieve high performance on specific datasets, they often sacrifice\ninterpretability, making it challenging to understand the factors driving human\nperception of complexity. A recent model based on image segmentations showed\npromise in addressing this challenge; however, it presented limitations in\ncapturing structural and semantic aspects of visual complexity. In this paper,\nwe propose viable and effective features to overcome these shortcomings.\nSpecifically, we develop multiscale features for the structural aspect of\ncomplexity, including the Multiscale Sobel Gradient (MSG), which captures\nspatial intensity variations across scales, and Multiscale Unique Colors (MUC),\nwhich quantifies image colorfulness by indexing quantized RGB values. We also\nintroduce a new dataset SVG based on Visual Genome to explore the semantic\naspect of visual complexity, obtaining surprise scores based on the element of\nsurprise in images, which we demonstrate significantly contributes to perceived\ncomplexity. Overall, we suggest that the nature of the data is fundamental to\nunderstanding and modeling visual complexity, highlighting the importance of\nboth structural and semantic dimensions in providing a comprehensive,\ninterpretable assessment. 
The code for our analysis, experimental setup, and\ndataset will be made publicly available upon acceptance.\n","authors":["Karahan Sarıtaş","Tingke Shen","Surabhi S Nath","Peter Dayan"],"pdf_url":"https://arxiv.org/pdf/2501.15890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15628v3","updated":"2025-01-27T09:09:28Z","published":"2024-10-21T04:24:10Z","title":"Towards Kriging-informed Conditional Diffusion for Regional Sea-Level\n Data Downscaling","summary":" Given coarser-resolution projections from global climate models or satellite\ndata, the downscaling problem aims to estimate finer-resolution regional\nclimate data, capturing fine-scale spatial patterns and variability.\nDownscaling is any method to derive high-resolution data from low-resolution\nvariables, often to provide more detailed and local predictions and analyses.\nThis problem is societally crucial for effective adaptation, mitigation, and\nresilience against significant risks from climate change. The challenge arises\nfrom spatial heterogeneity and the need to recover finer-scale features while\nensuring model generalization. Most downscaling methods \\cite{Li2020} fail to\ncapture the spatial dependencies at finer scales and underperform on real-world\nclimate datasets, such as sea-level rise. We propose a novel Kriging-informed\nConditional Diffusion Probabilistic Model (Ki-CDPM) to capture spatial\nvariability while preserving fine-scale features. Experimental results on\nclimate data show that our proposed method is more accurate than\nstate-of-the-art downscaling techniques.\n","authors":["Subhankar Ghosh","Arun Sharma","Jayant Gupta","Aneesh Subramanian","Shashi Shekhar"],"pdf_url":"https://arxiv.org/pdf/2410.15628v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15878v1","updated":"2025-01-27T09:03:34Z","published":"2025-01-27T09:03:34Z","title":"Slot-Guided Adaptation of Pre-trained Diffusion Models for\n Object-Centric Learning and Compositional Generation","summary":" We present SlotAdapt, an object-centric learning method that combines slot\nattention with pretrained diffusion models by introducing adapters for\nslot-based conditioning. Our method preserves the generative power of\npretrained diffusion models, while avoiding their text-centric conditioning\nbias. We also incorporate an additional guidance loss into our architecture to\nalign cross-attention from adapter layers with slot attention. This enhances\nthe alignment of our model with the objects in the input image without using\nexternal supervision. Experimental results show that our method outperforms\nstate-of-the-art techniques in object discovery and image generation tasks\nacross multiple datasets, including those with real images. Furthermore, we\ndemonstrate through experiments that our method performs remarkably well on\ncomplex real-world images for compositional generation, in contrast to other\nslot-based generative methods in the literature. 
The project page can be found\nat $\\href{https://kaanakan.github.io/SlotAdapt/}{\\text{this https url}}$.\n","authors":["Adil Kaan Akan","Yucel Yemez"],"pdf_url":"https://arxiv.org/pdf/2501.15878v1.pdf","comment":"Accepted to ICLR2025.\n $\\href{https://kaanakan.github.io/SlotAdapt/}{\\text{Project Page}}$"},{"id":"http://arxiv.org/abs/2501.15870v1","updated":"2025-01-27T08:46:22Z","published":"2025-01-27T08:46:22Z","title":"D-PLS: Decoupled Semantic Segmentation for\n 4D-Panoptic-LiDAR-Segmentation","summary":" This paper introduces a novel approach to 4D Panoptic LiDAR Segmentation that\ndecouples semantic and instance segmentation, leveraging single-scan semantic\npredictions as prior information for instance segmentation. Our method D-PLS\nfirst performs single-scan semantic segmentation and aggregates the results\nover time, using them to guide instance segmentation. The modular design of\nD-PLS allows for seamless integration on top of any semantic segmentation\narchitecture, without requiring architectural changes or retraining. We\nevaluate our approach on the SemanticKITTI dataset, where it demonstrates\nsignificant improvements over the baseline in both classification and\nassociation tasks, as measured by the LiDAR Segmentation and Tracking Quality\n(LSTQ) metric. Furthermore, we show that our decoupled architecture not only\nenhances instance prediction but also surpasses the baseline due to\nadvancements in single-scan semantic segmentation.\n","authors":["Maik Steinhauser","Laurenz Reichardt","Nikolas Ebert","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2501.15870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15860v1","updated":"2025-01-27T08:36:14Z","published":"2025-01-27T08:36:14Z","title":"The Components of Collaborative Joint Perception and Prediction -- A\n Conceptual Framework","summary":" Connected Autonomous Vehicles (CAVs) benefit from Vehicle-to-Everything (V2X)\ncommunication, which enables the exchange of sensor data to achieve\nCollaborative Perception (CP). To reduce cumulative errors in perception\nmodules and mitigate the visual occlusion, this paper introduces a new task,\nCollaborative Joint Perception and Prediction (Co-P&P), and provides a\nconceptual framework for its implementation to improve motion prediction of\nsurrounding objects, thereby enhancing vehicle awareness in complex traffic\nscenarios. The framework consists of two decoupled core modules, Collaborative\nScene Completion (CSC) and Joint Perception and Prediction (P&P) module, which\nsimplify practical deployment and enhance scalability. Additionally, we outline\nthe challenges in Co-P&P and discuss future directions for this research area.\n","authors":["Lei Wan","Hannan Ejaz Keen","Alexey Vinel"],"pdf_url":"https://arxiv.org/pdf/2501.15860v1.pdf","comment":"8 pages, 4 figures, accepted by conference VEHITS2025"},{"id":"http://arxiv.org/abs/2501.15852v1","updated":"2025-01-27T08:19:17Z","published":"2025-01-27T08:19:17Z","title":"CausalSR: Structural Causal Model-Driven Super-Resolution with\n Counterfactual Inference","summary":" Physical and optical factors interacting with sensor characteristics create\ncomplex image degradation patterns. Despite advances in deep learning-based\nsuper-resolution, existing methods overlook the causal nature of degradation by\nadopting simplistic black-box mappings. This paper formulates super-resolution\nusing structural causal models to reason about image degradation processes. 
We\nestablish a mathematical foundation that unifies principles from causal\ninference, deriving necessary conditions for identifying latent degradation\nmechanisms and corresponding propagation. We propose a novel counterfactual\nlearning strategy that leverages semantic guidance to reason about hypothetical\ndegradation scenarios, leading to theoretically-grounded representations that\ncapture invariant features across different degradation conditions. The\nframework incorporates an adaptive intervention mechanism with provable bounds\non treatment effects, allowing precise manipulation of degradation factors\nwhile maintaining semantic consistency. Through extensive empirical validation,\nwe demonstrate that our approach achieves significant improvements over\nstate-of-the-art methods, particularly in challenging scenarios with compound\ndegradations. On standard benchmarks, our method consistently outperforms\nexisting approaches by significant margins (0.86-1.21dB PSNR), while providing\ninterpretable insights into the restoration process. The theoretical framework\nand empirical results demonstrate the fundamental importance of causal\nreasoning in understanding image restoration systems.\n","authors":["Zhengyang Lu","Bingjie Lu","Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2501.15852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15847v1","updated":"2025-01-27T08:16:54Z","published":"2025-01-27T08:16:54Z","title":"Can Location Embeddings Enhance Super-Resolution of Satellite Imagery?","summary":" Publicly available satellite imagery, such as Sentinel- 2, often lacks the\nspatial resolution required for accurate analysis of remote sensing tasks\nincluding urban planning and disaster response. Current super-resolution\ntechniques are typically trained on limited datasets, leading to poor\ngeneralization across diverse geographic regions. In this work, we propose a\nnovel super-resolution framework that enhances generalization by incorporating\ngeographic context through location embeddings. Our framework employs\nGenerative Adversarial Networks (GANs) and incorporates techniques from\ndiffusion models to enhance image quality. Furthermore, we address tiling\nartifacts by integrating information from neighboring images, enabling the\ngeneration of seamless, high-resolution outputs. We demonstrate the\neffectiveness of our method on the building segmentation task, showing\nsignificant improvements over state-of-the-art methods and highlighting its\npotential for real-world applications.\n","authors":["Daniel Panangian","Ksenia Bittner"],"pdf_url":"https://arxiv.org/pdf/2501.15847v1.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV)"},{"id":"http://arxiv.org/abs/2412.19055v2","updated":"2025-01-27T08:04:15Z","published":"2024-12-26T04:45:05Z","title":"SpectralKD: A Unified Framework for Interpreting and Distilling Vision\n Transformers via Spectral Analysis","summary":" Knowledge Distillation (KD) has achieved widespread success in compressing\nlarge Vision Transformers (ViTs), but a unified theoretical framework for both\nViTs and KD is still lacking. In this paper, we propose SpectralKD, a novel\nunified analytical framework that offers deeper insights into ViTs and\noptimizes KD via spectral analysis. Our model-wise analysis reveals that CaiT\nconcentrates information in their first and last few layers, informing optimal\nlayer selection for KD. 
Surprisingly, our layer-wise analysis discovers that\nSwin Transformer and CaiT exhibit similar spectral encoding patterns despite\ntheir architectural differences, leading to a feature map alignment guideline.\nBuilding on these insights, we propose a simple yet effective spectral\nalignment method for KD. Benefiting from the deeper understanding provided by the above\nanalysis, even such a simple strategy achieves state-of-the-art\nperformance on ImageNet-1K without introducing any trainable parameters,\nimproving DeiT-Tiny by $+5.2\\%$ and Swin-Tiny by $+1.4\\%$ in top-1 accuracy.\nFurthermore, our post-training analysis reveals that distilled students can\nreproduce spectral patterns similar to their teachers, opening a new area we\nterm ``distillation dynamics''. Code and experimental logs are available at\nhttps://github.com/thy960112/SpectralKD.\n","authors":["Huiyuan Tian","Bonan Xu","Shijian Li","Gang Pan"],"pdf_url":"https://arxiv.org/pdf/2412.19055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15839v1","updated":"2025-01-27T08:00:12Z","published":"2025-01-27T08:00:12Z","title":"Controllable Hand Grasp Generation for HOI and Efficient Evaluation\n Methods","summary":" Controllable affordance Hand-Object Interaction (HOI) generation has become\nan increasingly important area of research in computer vision. In HOI\ngeneration, hand grasp generation is a crucial step for effectively\ncontrolling the geometry of the hand. Current hand grasp generation methods\nrely on 3D information for both the hand and the object. In addition, these\nmethods lack controllability concerning the hand's location and orientation. We\ntreat the hand pose as a discrete graph structure and exploit the geometric\npriors. It is well established that higher order contextual dependency among\nthe points improves the quality of the results in general. We propose a\nframework of higher order geometric representations (HOR's) inspired by\nspectral graph theory and vector algebra to improve the quality of generated\nhand poses. We demonstrate the effectiveness of our proposed HOR's in devising\na controllable novel diffusion method (based on 2D information) for hand grasp\ngeneration that outperforms the state of the art (SOTA). This overcomes the\nlimitations of existing methods, such as the lack of controllability and the dependency\non 3D information. Once we have the generated poses, it is very natural to\nevaluate them using a metric. Popular metrics like FID and MMD are biased and\ninefficient for evaluating the generated hand poses. Using our proposed HOR's,\nwe introduce an efficient and stable framework of evaluation metrics for grasp\ngeneration methods, addressing inefficiencies and biases in FID and MMD.\n","authors":[" Ishant","Rongliang Wu","Joo Hwee Lim"],"pdf_url":"https://arxiv.org/pdf/2501.15839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13967v2","updated":"2025-01-27T07:48:49Z","published":"2025-01-22T07:08:45Z","title":"FedDAG: Federated Domain Adversarial Generation Towards Generalizable\n Medical Image Analysis","summary":" Federated domain generalization aims to train a global model from multiple\nsource domains and ensure its generalization ability to unseen target domains.\nSince the target domain comes with unknown domain shifts, attempting to\napproximate these gaps by source domains may be the key to improving model\ngeneralization capability. 
Existing works mainly focus on sharing and\nrecombining local domain-specific attributes to increase data diversity and\nsimulate potential domain shifts. However, these methods may be insufficient,\nsince recombining only local attributes can hardly reach the\nout-of-distribution regions of the global data. In this paper, we propose a\nsimple-yet-efficient framework named Federated Domain Adversarial Generation\n(FedDAG). It aims to simulate the domain shift and improve the model\ngeneralization by adversarially generating novel domains different from local\nand global source domains. Specifically, it generates novel-style images by\nmaximizing the instance-level feature discrepancy between original and\ngenerated images and trains a generalizable task model by minimizing their\nfeature discrepancy. Further, we observed that FedDAG could cause different\nperformance improvements for local models. This may be due to inherent data\nisolation and heterogeneity among clients, exacerbating the imbalance in their\ngeneralization contributions to the global model. Ignoring this imbalance can\nlead the global model's generalization ability to be sub-optimal, further\nlimiting the novel domain generation procedure. Thus, to mitigate this\nimbalance, FedDAG hierarchically aggregates local models at the within-client\nand across-client levels by using the sharpness concept to evaluate client\nmodel generalization contributions. Extensive experiments across four medical\nbenchmarks demonstrate FedDAG's ability to enhance generalization in federated\nmedical scenarios.\n","authors":["Haoxuan Che","Yifei Wu","Haibo Jin","Yong Xia","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2501.13967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15831v1","updated":"2025-01-27T07:37:37Z","published":"2025-01-27T07:37:37Z","title":"Pfungst and Clever Hans: Identifying the unintended cues in a widely\n used Alzheimer's disease MRI dataset using explainable deep learning","summary":" Backgrounds.\n Deep neural networks have demonstrated high accuracy in classifying\nAlzheimer's disease (AD). This study aims to illuminate the underlying black-box\nnature and reveal the individual contributions of T1-weighted (T1w) gray-white\nmatter texture, volumetric information and preprocessing to classification\nperformance.\n Methods.\n We utilized T1w MRI data from the Alzheimer's Disease Neuroimaging Initiative\nto distinguish matched AD patients (990 MRIs) from healthy controls (990 MRIs).\nPreprocessing included skull stripping and binarization at varying thresholds\nto systematically eliminate texture information. A deep neural network was\ntrained on these configurations, and the model performance was compared using\nMcNemar tests with discrete Bonferroni-Holm correction. Layer-wise Relevance\nPropagation (LRP) and structural similarity metrics between heatmaps were\napplied to analyze learned features.\n Results.\n Classification performance metrics (accuracy, sensitivity, and specificity)\nwere comparable across all configurations, indicating a negligible influence of\nT1w gray- and white signal texture. Models trained on binarized images\ndemonstrated similar feature performance and relevance distributions, with\nvolumetric features such as atrophy and skull-stripping features emerging as\nprimary contributors.\n Conclusions.\n We revealed a previously undiscovered Clever Hans effect in a widely used AD\nMRI dataset. 
Deep neural network classification predominantly relies on\nvolumetric features, while eliminating gray-white matter T1w texture did not\ndecrease the performance. This study clearly demonstrates an overestimation of\nthe importance of gray-white matter contrasts, at least for widely used\nstructural T1w images, and highlights potential misinterpretation of\nperformance metrics.\n","authors":["Christian Tinauer","Maximilian Sackl","Rudolf Stollberger","Stefan Ropele","Christian Langkammer"],"pdf_url":"https://arxiv.org/pdf/2501.15831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15808v1","updated":"2025-01-27T06:28:45Z","published":"2025-01-27T06:28:45Z","title":"ClearSight: Human Vision-Inspired Solutions for Event-Based Motion\n Deblurring","summary":" Motion deblurring addresses the challenge of image blur caused by camera or\nscene movement. Event cameras provide motion information that is encoded in the\nasynchronous event streams. To efficiently leverage the temporal information of\nevent streams, we employ Spiking Neural Networks (SNNs) for motion feature\nextraction and Artificial Neural Networks (ANNs) for color information\nprocessing. Due to the non-uniform distribution and inherent redundancy of\nevent data, existing cross-modal feature fusion methods exhibit certain\nlimitations. Inspired by the visual attention mechanism in the human visual\nsystem, this study introduces a bioinspired dual-drive hybrid network (BDHNet).\nSpecifically, the Neuron Configurator Module (NCM) is designed to dynamically\nadjust neuron configurations based on cross-modal features, thereby focusing\nthe spikes in blurry regions and adapting to varying blurry scenarios\ndynamically. Additionally, the Region of Blurry Attention Module (RBAM) is\nintroduced to generate a blurry mask in an unsupervised manner, effectively\nextracting motion clues from the event features and guiding more accurate\ncross-modal feature fusion. Extensive subjective and objective evaluations\ndemonstrate that our method outperforms current state-of-the-art methods on\nboth synthetic and real-world datasets.\n","authors":["Xiaopeng Lin","Yulong Huang","Hongwei Ren","Zunchang Liu","Yue Zhou","Haotian Fu","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.15808v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.11775v2","updated":"2025-01-27T06:25:11Z","published":"2024-06-17T17:32:42Z","title":"Task Me Anything","summary":" Benchmarks for large multimodal language models (MLMs) now serve to\nsimultaneously assess the general capabilities of models instead of evaluating\nfor a specific capability. As a result, when a developer wants to identify\nwhich models to use for their application, they are overwhelmed by the number\nof benchmarks and remain uncertain about which benchmark's results are most\nreflective of their specific use case. This paper introduces Task-Me-Anything,\na benchmark generation engine which produces a benchmark tailored to a user's\nneeds. Task-Me-Anything maintains an extendable taxonomy of visual assets and\ncan programmatically generate a vast number of task instances. Additionally, it\nalgorithmically addresses user queries regarding MLM performance efficiently\nwithin a computational budget. It contains 113K images, 10K videos, 2K 3D\nobject assets, over 365 object categories, 655 attributes, and 335\nrelationships. It can generate 750M image/video question-answering pairs, which\nfocus on evaluating MLM perceptual capabilities. 
Task-Me-Anything reveals\ncritical insights: open-source MLMs excel in object and attribute recognition\nbut lack spatial and temporal understanding; each model exhibits unique\nstrengths and weaknesses; larger models generally perform better, though\nexceptions exist; and GPT4o demonstrates challenges in recognizing\nrotating/moving objects and distinguishing colors.\n","authors":["Jieyu Zhang","Weikai Huang","Zixian Ma","Oscar Michel","Dong He","Tanmay Gupta","Wei-Chiu Ma","Ali Farhadi","Aniruddha Kembhavi","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2406.11775v2.pdf","comment":"NeurIPS 2024 Track on Datasets and Benchmarks. Website:\n https://www.task-me-anything.org"},{"id":"http://arxiv.org/abs/2501.15798v1","updated":"2025-01-27T05:49:06Z","published":"2025-01-27T05:49:06Z","title":"MM-Retinal V2: Transfer an Elite Knowledge Spark into Fundus\n Vision-Language Pretraining","summary":" Vision-language pretraining (VLP) has been investigated to generalize across\ndiverse downstream tasks for fundus image analysis. Although recent methods\nshowcase promising achievements, they significantly rely on large-scale private\nimage-text data but pay less attention to the pretraining manner, which limits\ntheir further advancements. In this work, we introduce MM-Retinal V2, a\nhigh-quality image-text paired dataset comprising CFP, FFA, and OCT image\nmodalities. Then, we propose a novel fundus vision-language pretraining model,\nnamely KeepFIT V2, which is pretrained by integrating knowledge from the elite\ndata spark into categorical public datasets. Specifically, a preliminary\ntextual pretraining is adopted to equip the text encoder with primarily\nophthalmic textual knowledge. Moreover, a hybrid image-text knowledge injection\nmodule is designed for knowledge transfer, which is essentially based on a\ncombination of global semantic concepts from contrastive learning and local\nappearance details from generative learning. Extensive experiments across\nzero-shot, few-shot, and linear probing settings highlight the generalization\nand transferability of KeepFIT V2, delivering performance competitive to\nstate-of-the-art fundus VLP models trained on large-scale private image-text\ndatasets. Our dataset and model are publicly available via\nhttps://github.com/lxirich/MM-Retinal.\n","authors":["Ruiqi Wu","Na Su","Chenran Zhang","Tengfei Ma","Tao Zhou","Zhiting Cui","Nianfeng Tang","Tianyu Mao","Yi Zhou","Wen Fan","Tianxing Wu","Shenqi Jing","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2501.15798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07766v2","updated":"2025-01-27T05:48:05Z","published":"2024-12-10T18:58:29Z","title":"Make-A-Texture: Fast Shape-Aware Texture Generation in 3 Seconds","summary":" We present Make-A-Texture, a new framework that efficiently synthesizes\nhigh-resolution texture maps from textual prompts for given 3D geometries. Our\napproach progressively generates textures that are consistent across multiple\nviewpoints with a depth-aware inpainting diffusion model, in an optimized\nsequence of viewpoints determined by an automatic view selection algorithm.\n A significant feature of our method is its remarkable efficiency, achieving a\nfull texture generation within an end-to-end runtime of just 3.07 seconds on a\nsingle NVIDIA H100 GPU, significantly outperforming existing methods. Such an\nacceleration is achieved by optimizations in the diffusion model and a\nspecialized backprojection method. 
Moreover, our method reduces the artifacts\nin the backprojection phase, by selectively masking out non-frontal faces, and\ninternal faces of open-surfaced objects.\n Experimental results demonstrate that Make-A-Texture matches or exceeds the\nquality of other state-of-the-art methods. Our work significantly improves the\napplicability and practicality of texture generation models for real-world 3D\ncontent creation, including interactive creation and text-guided texture\nediting.\n","authors":["Xiaoyu Xiang","Liat Sless Gorelik","Yuchen Fan","Omri Armstrong","Forrest Iandola","Yilei Li","Ita Lifshitz","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2412.07766v2.pdf","comment":"Accepted to WACV 2025 Webpage:\n https://mukosame.github.io/make-a-texture/ Video:\n https://www.youtube.com/watch?v=2Ctqdx1uaj0"},{"id":"http://arxiv.org/abs/2501.15795v1","updated":"2025-01-27T05:41:10Z","published":"2025-01-27T05:41:10Z","title":"Can Multimodal Large Language Models be Guided to Improve Industrial\n Anomaly Detection?","summary":" In industrial settings, the accurate detection of anomalies is essential for\nmaintaining product quality and ensuring operational safety. Traditional\nindustrial anomaly detection (IAD) models often struggle with flexibility and\nadaptability, especially in dynamic production environments where new defect\ntypes and operational changes frequently arise. Recent advancements in\nMultimodal Large Language Models (MLLMs) hold promise for overcoming these\nlimitations by combining visual and textual information processing\ncapabilities. MLLMs excel in general visual understanding due to their training\non large, diverse datasets, but they lack domain-specific knowledge, such as\nindustry-specific defect tolerance levels, which limits their effectiveness in\nIAD tasks. To address these challenges, we propose Echo, a novel multi-expert\nframework designed to enhance MLLM performance for IAD. Echo integrates four\nexpert modules: Reference Extractor which provides a contextual baseline by\nretrieving similar normal images, Knowledge Guide which supplies\ndomain-specific insights, Reasoning Expert which enables structured, stepwise\nreasoning for complex queries, and Decision Maker which synthesizes information\nfrom all modules to deliver precise, context-aware responses. Evaluated on the\nMMAD benchmark, Echo demonstrates significant improvements in adaptability,\nprecision, and robustness, moving closer to meeting the demands of real-world\nindustrial anomaly detection.\n","authors":["Zhiling Chen","Hanning Chen","Mohsen Imani","Farhad Imani"],"pdf_url":"https://arxiv.org/pdf/2501.15795v1.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2501.01003v2","updated":"2025-01-27T04:59:34Z","published":"2025-01-02T01:56:58Z","title":"EasySplat: View-Adaptive Learning makes 3D Gaussian Splatting Easy","summary":" 3D Gaussian Splatting (3DGS) techniques have achieved satisfactory 3D scene\nrepresentation. Despite their impressive performance, they confront challenges\ndue to the limitation of structure-from-motion (SfM) methods on acquiring\naccurate scene initialization, or the inefficiency of densification strategy.\nIn this paper, we introduce a novel framework EasySplat to achieve high-quality\n3DGS modeling. Instead of using SfM for scene initialization, we employ a novel\nmethod to release the power of large-scale pointmap approaches. 
Specifically,\nwe propose an efficient grouping strategy based on view similarity, and use\nrobust pointmap priors to obtain high-quality point clouds and camera poses for\n3D scene initialization. After obtaining a reliable scene structure, we propose\na novel densification approach that adaptively splits Gaussian primitives based\non the average shape of neighboring Gaussian ellipsoids, utilizing KNN scheme.\nIn this way, the proposed method tackles the limitation on initialization and\noptimization, leading to an efficient and accurate 3DGS modeling. Extensive\nexperiments demonstrate that EasySplat outperforms the current state-of-the-art\n(SOTA) in handling novel view synthesis.\n","authors":["Ao Gao","Luosong Guo","Tao Chen","Zhao Wang","Ying Tai","Jian Yang","Zhenyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.01003v2.pdf","comment":"6 pages, 5figures"},{"id":"http://arxiv.org/abs/2501.15775v1","updated":"2025-01-27T04:47:19Z","published":"2025-01-27T04:47:19Z","title":"Do Existing Testing Tools Really Uncover Gender Bias in Text-to-Image\n Models?","summary":" Text-to-Image (T2I) models have recently gained significant attention due to\ntheir ability to generate high-quality images and are consequently used in a\nwide range of applications. However, there are concerns about the gender bias\nof these models. Previous studies have shown that T2I models can perpetuate or\neven amplify gender stereotypes when provided with neutral text prompts.\nResearchers have proposed automated gender bias uncovering detectors for T2I\nmodels, but a crucial gap exists: no existing work comprehensively compares the\nvarious detectors and understands how the gender bias detected by them deviates\nfrom the actual situation. This study addresses this gap by validating previous\ngender bias detectors using a manually labeled dataset and comparing how the\nbias identified by various detectors deviates from the actual bias in T2I\nmodels, as verified by manual confirmation. We create a dataset consisting of\n6,000 images generated from three cutting-edge T2I models: Stable Diffusion XL,\nStable Diffusion 3, and Dreamlike Photoreal 2.0. During the human-labeling\nprocess, we find that all three T2I models generate a portion (12.48% on\naverage) of low-quality images (e.g., generate images with no face present),\nwhere human annotators cannot determine the gender of the person. Our analysis\nreveals that all three T2I models show a preference for generating male images,\nwith SDXL being the most biased. Additionally, images generated using prompts\ncontaining professional descriptions (e.g., lawyer or doctor) show the most\nbias. We evaluate seven gender bias detectors and find that none fully capture\nthe actual level of bias in T2I models, with some detectors overestimating bias\nby up to 26.95%. 
We further investigate the causes of inaccurate estimations,\nhighlighting the limitations of detectors in dealing with low-quality images.\nBased on our findings, we propose an enhanced detector...\n","authors":["Yunbo Lyu","Zhou Yang","Yuqing Niu","Jing Jiang","David Lo"],"pdf_url":"https://arxiv.org/pdf/2501.15775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15774v1","updated":"2025-01-27T04:46:58Z","published":"2025-01-27T04:46:58Z","title":"Efficient Attention-Sharing Information Distillation Transformer for\n Lightweight Single Image Super-Resolution","summary":" Transformer-based Super-Resolution (SR) methods have demonstrated superior\nperformance compared to convolutional neural network (CNN)-based SR approaches\ndue to their capability to capture long-range dependencies. However, their high\ncomputational complexity necessitates the development of lightweight approaches\nfor practical use. To address this challenge, we propose the Attention-Sharing\nInformation Distillation (ASID) network, a lightweight SR network that\nintegrates attention-sharing and an information distillation structure\nspecifically designed for Transformer-based SR methods. We modify the\ninformation distillation scheme, originally designed for efficient CNN\noperations, to reduce the computational load of stacked self-attention layers,\neffectively addressing the efficiency bottleneck. Additionally, we introduce\nattention-sharing across blocks to further minimize the computational cost of\nself-attention operations. By combining these strategies, ASID achieves\ncompetitive performance with existing SR methods while requiring only around\n300K parameters - significantly fewer than existing CNN-based and\nTransformer-based SR models. Furthermore, ASID outperforms state-of-the-art SR\nmethods when the number of parameters is matched, demonstrating its efficiency\nand effectiveness. The code and supplementary material are available on the\nproject page.\n","authors":["Karam Park","Jae Woong Soh","Nam Ik Cho"],"pdf_url":"https://arxiv.org/pdf/2501.15774v1.pdf","comment":"Published at AAAI 2025, for project page, see\n https://github.com/saturnian77/ASID"},{"id":"http://arxiv.org/abs/2501.15763v1","updated":"2025-01-27T04:16:42Z","published":"2025-01-27T04:16:42Z","title":"NanoHTNet: Nano Human Topology Network for Efficient 3D Human Pose\n Estimation","summary":" The widespread application of 3D human pose estimation (HPE) is limited by\nresource-constrained edge devices, requiring more efficient models. A key\napproach to enhancing efficiency involves designing networks based on the\nstructural characteristics of input data. However, effectively utilizing the\nstructural priors in human skeletal inputs remains challenging. To address\nthis, we leverage both explicit and implicit spatio-temporal priors of the\nhuman body through innovative model design and a pre-training proxy task.\nFirst, we propose a Nano Human Topology Network (NanoHTNet), a tiny 3D HPE\nnetwork with stacked Hierarchical Mixers to capture explicit features.\nSpecifically, the spatial Hierarchical Mixer efficiently learns the human\nphysical topology across multiple semantic levels, while the temporal\nHierarchical Mixer with discrete cosine transform and low-pass filtering\ncaptures local instantaneous movements and global action coherence. 
Moreover,\nEfficient Temporal-Spatial Tokenization (ETST) is introduced to enhance\nspatio-temporal interaction and reduce computational complexity significantly.\nSecond, PoseCLR is proposed as a general pre-training method based on\ncontrastive learning for 3D HPE, aimed at extracting implicit representations\nof human topology. By aligning 2D poses from diverse viewpoints in the proxy\ntask, PoseCLR aids 3D HPE encoders like NanoHTNet in more effectively capturing\nthe high-dimensional features of the human body, leading to further performance\nimprovements. Extensive experiments verify that NanoHTNet with PoseCLR\noutperforms other state-of-the-art methods in efficiency, making it ideal for\ndeployment on edge devices like the Jetson Nano. Code and models are available\nat https://github.com/vefalun/NanoHTNet.\n","authors":["Jialun Cai","Mengyuan Liu","Hong Liu","Wenhao Li","Shuheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.15763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12114v3","updated":"2025-01-27T04:02:25Z","published":"2024-05-20T15:29:26Z","title":"A New Cross-Space Total Variation Regularization Model for Color Image\n Restoration with Quaternion Blur Operator","summary":" The cross-channel deblurring problem in color image processing is difficult\nto solve due to the complex coupling and structural blurring of color pixels.\nUntil now, there have been few efficient algorithms that can reduce color artifacts\nin the deblurring process. To solve this challenging problem, we present a novel\ncross-space total variation (CSTV) regularization model for color image\ndeblurring by introducing a quaternion blur operator and a cross-color space\nregularization functional. The existence and uniqueness of the solution are\nproved and a new L-curve method is proposed to find a balance of regularization\nterms on different color spaces. The Euler-Lagrange equation is derived to show\nthat CSTV has taken into account the coupling of all color channels and the\nlocal smoothing within each color channel. A quaternion operator splitting\nmethod is first proposed to enhance the color artifact reduction ability\nof the CSTV regularization model. This strategy also applies to the well-known\ncolor deblurring models. Numerical experiments on color image databases\nillustrate the efficiency and effectiveness of the new model and algorithms.\nThe color images restored by them successfully maintain the color and spatial\ninformation and are of higher quality in terms of PSNR, SSIM, MSE and CIEde2000\nthan the restorations of the state-of-the-art methods.\n","authors":["Zhigang Jia","Yuelian Xiang","Meixiang Zhao","Tingting Wu","Michael K. Ng"],"pdf_url":"https://arxiv.org/pdf/2405.12114v3.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2501.15757v1","updated":"2025-01-27T04:00:05Z","published":"2025-01-27T04:00:05Z","title":"Efficiency Bottlenecks of Convolutional Kolmogorov-Arnold Networks: A\n Comprehensive Scrutiny with ImageNet, AlexNet, LeNet and Tabular\n Classification","summary":" Algorithmic level developments like Convolutional Neural Networks,\ntransformers, attention mechanisms, Retrieval Augmented Generation and so on\nhave changed Artificial Intelligence. A recent such development was Kolmogorov-Arnold\nNetworks, which suggested challenging the fundamental concept\nof a Neural Network, and thus changing the Multilayer Perceptron and Convolutional\nNeural Networks. 
They were well received in terms of scientific\nmodeling, yet had some drawbacks in terms of efficiency. In this paper, we\ntrain Convolutional Kolmogorov-Arnold Networks (CKANs) on the ImageNet-1k\ndataset with 1.3 million images, the MNIST dataset with 60k images and a tabular\nbiological science related MoA dataset, and test the promise of CKANs in terms\nof FLOPS, inference time, number of trainable parameters and training time\nagainst the accuracy, precision, recall and F1 score they produce, compared with the\nstandard industry practice on CNN models. We show that CKANs perform fairly well,\nyet more slowly than CNNs, on small datasets like MoA and MNIST, but are not\nnearly comparable as the dataset gets larger and more complex, like\nImageNet. The code implementation of this paper can be found at the link:\n\\href{https://github.com/ashimdahal/Study-of-Convolutional-Kolmogorov-Arnold-networks}{https://github.com/ashimdahal/Study-of-Convolutional-Kolmogorov-Arnold-networks}\n","authors":["Ashim Dahal","Saydul Akbar Murad","Nick Rahimi"],"pdf_url":"https://arxiv.org/pdf/2501.15757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09323v4","updated":"2025-01-27T03:44:24Z","published":"2024-03-14T12:12:17Z","title":"E2E-MFD: Towards End-to-End Synchronous Multimodal Fusion Detection","summary":" Multimodal image fusion and object detection are crucial for autonomous\ndriving. While current methods have advanced the fusion of texture details and\nsemantic information, their complex training processes hinder broader\napplications. Addressing this challenge, we introduce E2E-MFD, a novel\nend-to-end algorithm for multimodal fusion detection. E2E-MFD streamlines the\nprocess, achieving high performance with a single training phase. It employs\nsynchronous joint optimization across components to avoid suboptimal solutions\ntied to individual tasks. Furthermore, it implements a comprehensive\noptimization strategy in the gradient matrix for shared parameters, ensuring\nconvergence to an optimal fusion detection configuration. Our extensive testing\non multiple public datasets reveals E2E-MFD's superior capabilities, showcasing\nnot only visually appealing image fusion but also impressive detection\noutcomes, such as a 3.9% and 2.0% mAP50 increase on horizontal object detection\ndataset M3FD and oriented object detection dataset DroneVehicle, respectively,\ncompared to state-of-the-art approaches. The code is released at\nhttps://github.com/icey-zhang/E2E-MFD.\n","authors":["Jiaqing Zhang","Mingxiang Cao","Weiying Xie","Jie Lei","Daixun Li","Wenbo Huang","Yunsong Li","Xue Yang"],"pdf_url":"https://arxiv.org/pdf/2403.09323v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00684v2","updated":"2025-01-27T03:28:31Z","published":"2024-06-02T09:42:23Z","title":"Deciphering Oracle Bone Language with Diffusion Models","summary":" Originating from China's Shang Dynasty approximately 3,000 years ago, the\nOracle Bone Script (OBS) is a cornerstone in the annals of linguistic history,\npredating many established writing systems. Despite the discovery of thousands\nof inscriptions, a vast expanse of OBS remains undeciphered, casting a veil of\nmystery over this ancient language. The emergence of modern AI technologies\npresents a novel frontier for OBS decipherment, challenging traditional NLP\nmethods that rely heavily on large textual corpora, a luxury not afforded by\nhistorical languages. 
This paper introduces a novel approach by adopting image\ngeneration techniques, specifically through the development of Oracle Bone\nScript Decipher (OBSD). Utilizing a conditional diffusion-based strategy, OBSD\ngenerates vital clues for decipherment, charting a new course for AI-assisted\nanalysis of ancient languages. To validate its efficacy, extensive experiments\nwere conducted on an oracle bone script dataset, with quantitative results\ndemonstrating the effectiveness of OBSD. Code and decipherment results will be\nmade available at https://github.com/guanhaisu/OBSD.\n","authors":["Haisu Guan","Huanxin Yang","Xinyu Wang","Shengwei Han","Yongge Liu","Lianwen Jin","Xiang Bai","Yuliang Liu"],"pdf_url":"https://arxiv.org/pdf/2406.00684v2.pdf","comment":"ACL 2024 Best Paper"},{"id":"http://arxiv.org/abs/2501.15743v1","updated":"2025-01-27T03:09:58Z","published":"2025-01-27T03:09:58Z","title":"Z-Stack Scanning can Improve AI Detection of Mitosis: A Case Study of\n Meningiomas","summary":" Z-stack scanning is an emerging whole slide imaging technology that captures\nmultiple focal planes alongside the z-axis of a glass slide. Because z-stacking\ncan offer enhanced depth information compared to the single-layer whole slide\nimaging, this technology can be particularly useful in analyzing small-scaled\nhistopathological patterns. However, its actual clinical impact remains debated\nwith mixed results. To clarify this, we investigate the effect of z-stack\nscanning on artificial intelligence (AI) mitosis detection of meningiomas. With\nthe same set of 22 Hematoxylin and Eosin meningioma glass slides scanned by\nthree different digital pathology scanners, we tested the performance of three\nAI pipelines on both single-layer and z-stacked whole slide images (WSIs).\nResults showed that in all scanner-AI combinations, z-stacked WSIs\nsignificantly increased AI's sensitivity (+17.14%) on the mitosis detection\nwith only a marginal impact on precision. Our findings provide quantitative\nevidence that highlights z-stack scanning as a promising technique for AI\nmitosis detection, paving the way for more reliable AI-assisted pathology\nworkflows, which can ultimately benefit patient management.\n","authors":["Hongyan Gu","Ellie Onstott","Wenzhong Yan","Tengyou Xu","Ruolin Wang","Zida Wu","Xiang 'Anthony' Chen","Mohammad Haeri"],"pdf_url":"https://arxiv.org/pdf/2501.15743v1.pdf","comment":"To appear 2025 IEEE 22nd International Symposium on Biomedical\n Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2501.14317v2","updated":"2025-01-27T02:45:47Z","published":"2025-01-24T08:22:02Z","title":"Nautilus: Locality-aware Autoencoder for Scalable Mesh Generation","summary":" Triangle meshes are fundamental to 3D applications, enabling efficient\nmodification and rasterization while maintaining compatibility with standard\nrendering pipelines. However, current automatic mesh generation methods\ntypically rely on intermediate representations that lack the continuous surface\nquality inherent to meshes. Converting these representations into meshes\nproduces dense, suboptimal outputs. Although recent autoregressive approaches\ndemonstrate promise in directly modeling mesh vertices and faces, they are\nconstrained by the limitation in face count, scalability, and structural\nfidelity. 
To address these challenges, we propose Nautilus, a locality-aware\nautoencoder for artist-like mesh generation that leverages the local properties\nof manifold meshes to achieve structural fidelity and efficient representation.\nOur approach introduces a novel tokenization algorithm that preserves face\nproximity relationships and compresses sequence length through locally shared\nvertices and edges, enabling the generation of meshes with an unprecedented\nscale of up to 5,000 faces. Furthermore, we develop a Dual-stream Point\nConditioner that provides multi-scale geometric guidance, ensuring global\nconsistency and local structural fidelity by capturing fine-grained geometric\nfeatures. Extensive experiments demonstrate that Nautilus significantly\noutperforms state-of-the-art methods in both fidelity and scalability. The\nproject page will be released to https://nautilusmeshgen.github.io.\n","authors":["Yuxuan Wang","Xuanyu Yi","Haohan Weng","Qingshan Xu","Xiaokang Wei","Xianghui Yang","Chunchao Guo","Long Chen","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.14317v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2501.15733v1","updated":"2025-01-27T02:18:08Z","published":"2025-01-27T02:18:08Z","title":"Leveraging Video Vision Transformer for Alzheimer's Disease Diagnosis\n from 3D Brain MRI","summary":" Alzheimer's disease (AD) is a neurodegenerative disorder affecting millions\nworldwide, necessitating early and accurate diagnosis for optimal patient\nmanagement. In recent years, advancements in deep learning have shown\nremarkable potential in medical image analysis. Methods In this study, we\npresent \"ViTranZheimer,\" an AD diagnosis approach which leverages video vision\ntransformers to analyze 3D brain MRI data. By treating the 3D MRI volumes as\nvideos, we exploit the temporal dependencies between slices to capture\nintricate structural relationships. The video vision transformer's\nself-attention mechanisms enable the model to learn long-range dependencies and\nidentify subtle patterns that may indicate AD progression. Our proposed deep\nlearning framework seeks to enhance the accuracy and sensitivity of AD\ndiagnosis, empowering clinicians with a tool for early detection and\nintervention. We validate the performance of the video vision transformer using\nthe ADNI dataset and conduct comparative analyses with other relevant models.\nResults The proposed ViTranZheimer model is compared with two hybrid models,\nCNN-BiLSTM and ViT-BiLSTM. CNN-BiLSTM is the combination of a convolutional\nneural network (CNN) and a bidirectional long-short-term memory network\n(BiLSTM), while ViT-BiLSTM is the combination of a vision transformer (ViT)\nwith BiLSTM. The accuracy levels achieved in the ViTranZheimer, CNN-BiLSTM, and\nViT-BiLSTM models are 98.6%, 96.479%, and 97.465%, respectively. ViTranZheimer\ndemonstrated the highest accuracy at 98.6%, outperforming other models in this\nevaluation metric, indicating its superior performance in this specific\nevaluation metric. Conclusion This research advances the understanding of\napplying deep learning techniques in neuroimaging and Alzheimer's disease\nresearch, paving the way for earlier and less invasive clinical diagnosis.\n","authors":["Taymaz Akan","Sait Alp","Md. Shenuarin Bhuiyan","Elizabeth A. Disbrow","Steven A. Conrad","John A. Vanchiere","Christopher G. Kevil","Mohammad A. N. 
Bhuiyan"],"pdf_url":"https://arxiv.org/pdf/2501.15733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06234v2","updated":"2025-01-27T01:45:15Z","published":"2024-10-08T17:45:51Z","title":"TEOChat: A Large Vision-Language Assistant for Temporal Earth\n Observation Data","summary":" Large vision and language assistants have enabled new capabilities for\ninterpreting natural images. These approaches have recently been adapted to\nearth observation data, but they are only able to handle single image inputs,\nlimiting their use for many real-world tasks. In this work, we develop a new\nvision and language assistant called TEOChat that can engage in conversations\nabout temporal sequences of earth observation data. To train TEOChat, we curate\nan instruction-following dataset composed of many single image and temporal\ntasks including building change and damage assessment, semantic change\ndetection, and temporal scene classification. We show that TEOChat can perform\na wide variety of spatial and temporal reasoning tasks, substantially\noutperforming previous vision and language assistants, and even achieving\ncomparable or better performance than several specialist models trained to\nperform specific tasks. Furthermore, TEOChat achieves impressive zero-shot\nperformance on a change detection and change question answering dataset,\noutperforms GPT-4o and Gemini 1.5 Pro on multiple temporal tasks, and exhibits\nstronger single image capabilities than a comparable single image\ninstruction-following model on scene classification, visual question answering,\nand captioning. We publicly release our data, model, and code at\nhttps://github.com/ermongroup/TEOChat .\n","authors":["Jeremy Andrew Irvin","Emily Ruoyu Liu","Joyce Chuyi Chen","Ines Dormoy","Jinyoung Kim","Samar Khanna","Zhuo Zheng","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2410.06234v2.pdf","comment":"Published at ICLR 2025"},{"id":"http://arxiv.org/abs/2501.15724v1","updated":"2025-01-27T01:27:59Z","published":"2025-01-27T01:27:59Z","title":"A Survey on Computational Pathology Foundation Models: Datasets,\n Adaptation Strategies, and Evaluation Tasks","summary":" Computational pathology foundation models (CPathFMs) have emerged as a\npowerful approach for analyzing histopathological data, leveraging\nself-supervised learning to extract robust feature representations from\nunlabeled whole-slide images. These models, categorized into uni-modal and\nmulti-modal frameworks, have demonstrated promise in automating complex\npathology tasks such as segmentation, classification, and biomarker discovery.\nHowever, the development of CPathFMs presents significant challenges, such as\nlimited data accessibility, high variability across datasets, the necessity for\ndomain-specific adaptation, and the lack of standardized evaluation benchmarks.\nThis survey provides a comprehensive review of CPathFMs in computational\npathology, focusing on datasets, adaptation strategies, and evaluation tasks.\nWe analyze key techniques, such as contrastive learning and multi-modal\nintegration, and highlight existing gaps in current research. Finally, we\nexplore future directions from four perspectives for advancing CPathFMs. This\nsurvey serves as a valuable resource for researchers, clinicians, and AI\npractitioners, guiding the advancement of CPathFMs toward robust and clinically\napplicable AI-driven pathology solutions.\n","authors":["Dong Li","Guihong Wan","Xintao Wu","Xinyu Wu","Ajit J. Nirmal","Christine G. Lian","Peter K. 
Sorger","Yevgeniy R. Semenov","Chen Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.15724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12040v6","updated":"2025-01-27T00:57:19Z","published":"2024-07-01T17:59:55Z","title":"Comprehensive Performance Evaluation of YOLO11, YOLOv10, YOLOv9 and\n YOLOv8 on Detecting and Counting Fruitlet in Complex Orchard Environments","summary":" This study extensively evaluated You Only Look Once (YOLO) object detection\nalgorithms across all configurations (total 22) of YOLOv8, YOLOv9, YOLOv10, and\nYOLO11 (or YOLOv11) for green fruit detection in commercial orchards. The\nresearch also validated in-field fruitlet counting using an iPhone and machine\nvision sensors across four apple varieties: Scifresh, Scilate, Honeycrisp and\nCosmic Crisp. Among the 22 configurations evaluated, YOLOv11s and YOLOv9\ngelan-base outperformed others with mAP@50 scores of 0.933 and 0.935\nrespectively. In terms of recall, YOLOv9 gelan-base achieved the highest value\namong YOLOv9 configurations at 0.899, while YOLOv11m led YOLOv11 variants with\n0.897. YOLO11n emerged as the fastest model, achieving fastest inference speed\nof only 2.4 ms, significantly outpacing the leading configurations of YOLOv10n,\nYOLOv9 gelan-s, and YOLOv8n, with speeds of 5.5, 11.5, and 4.1 ms,\nrespectively. This comparative evaluation highlights the strengths of YOLOv11,\nYOLOv9, and YOLOv10, offering researchers essential insights to choose the\nbest-suited model for fruitlet detection and possible automation in commercial\norchards. For real-time automation related work in relevant datasets, we\nrecommend using YOLOv11n due to its high detection and image processing speed.\nKeywords: YOLO11, YOLO11 Object Detection, YOLOv10, YOLOv9, YOLOv8, You Only\nLook Once, Fruitlet Detection, Greenfruit Detection, YOLOv11 bounding box,\nYOLOv11 detection, YOLOv11 object detection, YOLOv11 machine learning, YOLOv11\nDeep Learning\n","authors":["Ranjan Sapkota","Zhichao Meng","Martin Churuvija","Xiaoqiang Du","Zenghong Ma","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2407.12040v6.pdf","comment":"15 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.15712v1","updated":"2025-01-27T00:31:30Z","published":"2025-01-27T00:31:30Z","title":"SeqSeg: Learning Local Segments for Automatic Vascular Model\n Construction","summary":" Computational modeling of cardiovascular function has become a critical part\nof diagnosing, treating and understanding cardiovascular disease. Most\nstrategies involve constructing anatomically accurate computer models of\ncardiovascular structures, which is a multistep, time-consuming process. To\nimprove the model generation process, we herein present SeqSeg (sequential\nsegmentation): a novel deep learning based automatic tracing and segmentation\nalgorithm for constructing image-based vascular models. SeqSeg leverages local\nU-Net-based inference to sequentially segment vascular structures from medical\nimage volumes. We tested SeqSeg on CT and MR images of aortic and aortofemoral\nmodels and compared the predictions to those of benchmark 2D and 3D global\nnnU-Net models, which have previously shown excellent accuracy for medical\nimage segmentation. We demonstrate that SeqSeg is able to segment more complete\nvasculature and is able to generalize to vascular structures not annotated in\nthe training data.\n","authors":["Numi Sveinsson Cepero","Shawn C. Shadden"],"pdf_url":"https://arxiv.org/pdf/2501.15712v1.pdf","comment":"32 pages, 12 figures. 
Ann Biomed Eng (2024)"},{"id":"http://arxiv.org/abs/2501.16583v1","updated":"2025-01-27T23:53:49Z","published":"2025-01-27T23:53:49Z","title":"Directing Mamba to Complex Textures: An Efficient Texture-Aware State\n Space Model for Image Restoration","summary":" Image restoration aims to recover details and enhance contrast in degraded\nimages. With the growing demand for high-quality imaging (\\textit{e.g.}, 4K and\n8K), achieving a balance between restoration quality and computational\nefficiency has become increasingly critical. Existing methods, primarily based\non CNNs, Transformers, or their hybrid approaches, apply uniform deep\nrepresentation extraction across the image. However, these methods often\nstruggle to effectively model long-range dependencies and largely overlook the\nspatial characteristics of image degradation (regions with richer textures tend\nto suffer more severe damage), making it hard to achieve the best trade-off\nbetween restoration quality and efficiency. To address these issues, we propose\na novel texture-aware image restoration method, TAMambaIR, which simultaneously\nperceives image textures and achieves a trade-off between performance and\nefficiency. Specifically, we introduce a novel Texture-Aware State Space Model,\nwhich enhances texture awareness and improves efficiency by modulating the\ntransition matrix of the state-space equation and focusing on regions with\ncomplex textures. Additionally, we design a {Multi-Directional Perception\nBlock} to improve multi-directional receptive fields while maintaining low\ncomputational overhead. Extensive experiments on benchmarks for image\nsuper-resolution, deraining, and low-light image enhancement demonstrate that\nTAMambaIR achieves state-of-the-art performance with significantly improved\nefficiency, establishing it as a robust and efficient framework for image\nrestoration.\n","authors":["Long Peng","Xin Di","Zhanfeng Feng","Wenbo Li","Renjing Pei","Yang Wang","Xueyang Fu","Yang Cao","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2501.16583v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2412.10587v2","updated":"2025-01-27T23:46:06Z","published":"2024-12-13T22:02:48Z","title":"Evaluation of GPT-4o and GPT-4o-mini's Vision Capabilities for\n Compositional Analysis from Dried Solution Drops","summary":" When microliter drops of salt solutions dry on non-porous surfaces, they form\nerratic yet characteristic deposit patterns influenced by complex\ncrystallization dynamics and fluid motion. Using OpenAI's image-enabled\nlanguage models, we analyzed deposits from 12 salts with 200 images per salt\nand per model. GPT-4o classified 57% of the salts accurately, significantly\noutperforming random chance and GPT-4o mini. This study underscores the promise\nof general-use AI tools for reliably identifying salts from their drying\npatterns.\n","authors":["Deven B. Dangi","Beni B. Dangi","Oliver Steinbock"],"pdf_url":"https://arxiv.org/pdf/2412.10587v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.20981v2","updated":"2025-01-27T23:41:27Z","published":"2024-05-31T16:26:30Z","title":"Generative Adversarial Networks in Ultrasound Imaging: Extending Field\n of View Beyond Conventional Limits","summary":" Transthoracic Echocardiography (TTE) is a fundamental, non-invasive\ndiagnostic tool in cardiovascular medicine, enabling detailed visualization of\ncardiac structures crucial for diagnosing various heart conditions. 
Despite its\nwidespread use, TTE ultrasound imaging faces inherent limitations, notably the\ntrade-off between field of view (FoV) and resolution. This paper introduces a\nnovel application of conditional Generative Adversarial Networks (cGANs),\nspecifically designed to extend the FoV in TTE ultrasound imaging while\nmaintaining high resolution. Our proposed cGAN architecture, termed echoGAN,\ndemonstrates the capability to generate realistic anatomical structures through\noutpainting, effectively broadening the viewable area in medical imaging. This\nadvancement has the potential to enhance both automatic and manual ultrasound\nnavigation, offering a more comprehensive view that could significantly reduce\nthe learning curve associated with ultrasound imaging and aid in more accurate\ndiagnoses. The results confirm that echoGAN reliably reproduce detailed cardiac\nfeatures, thereby promising a significant step forward in the field of\nnon-invasive cardiac naviagation and diagnostics.\n","authors":["Matej Gazda","Samuel Kadoury","Jakub Gazda","Peter Drotar"],"pdf_url":"https://arxiv.org/pdf/2405.20981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16571v1","updated":"2025-01-27T23:31:39Z","published":"2025-01-27T23:31:39Z","title":"Efficient Object Detection of Marine Debris using Pruned YOLO Model","summary":" Marine debris poses significant harm to marine life due to substances like\nmicroplastics, polychlorinated biphenyls, and pesticides, which damage habitats\nand poison organisms. Human-based solutions, such as diving, are increasingly\nineffective in addressing this issue. Autonomous underwater vehicles (AUVs) are\nbeing developed for efficient sea garbage collection, with the choice of object\ndetection architecture being critical. This research employs the YOLOv4 model\nfor real-time detection of marine debris using the Trash-ICRA 19 dataset,\nconsisting of 7683 images at 480x320 pixels. Various modifications-pretrained\nmodels, training from scratch, mosaic augmentation, layer freezing,\nYOLOv4-tiny, and channel pruning-are compared to enhance architecture\nefficiency. Channel pruning significantly improves detection speed, increasing\nthe base YOLOv4 frame rate from 15.19 FPS to 19.4 FPS, with only a 1.2% drop in\nmean Average Precision, from 97.6% to 96.4%.\n","authors":["Abi Aryaza","Novanto Yudistira"," Tibyani"],"pdf_url":"https://arxiv.org/pdf/2501.16571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16559v1","updated":"2025-01-27T23:02:24Z","published":"2025-01-27T23:02:24Z","title":"LoRA-X: Bridging Foundation Models with Training-Free Cross-Model\n Adaptation","summary":" The rising popularity of large foundation models has led to a heightened\ndemand for parameter-efficient fine-tuning methods, such as Low-Rank Adaptation\n(LoRA), which offer performance comparable to full model fine-tuning while\nrequiring only a few additional parameters tailored to the specific base model.\nWhen such base models are deprecated and replaced, all associated LoRA modules\nmust be retrained, requiring access to either the original training data or a\nsubstantial amount of synthetic data that mirrors the original distribution.\nHowever, the original data is often inaccessible due to privacy or licensing\nissues, and generating synthetic data may be impractical and insufficiently\nrepresentative. 
These factors complicate the fine-tuning process considerably.\nTo address this challenge, we introduce a new adapter, Cross-Model Low-Rank\nAdaptation (LoRA-X), which enables the training-free transfer of LoRA\nparameters across source and target models, eliminating the need for original\nor synthetic training data. Our approach imposes the adapter to operate within\nthe subspace of the source base model. This constraint is necessary because our\nprior knowledge of the target model is limited to its weights, and the criteria\nfor ensuring the adapter's transferability are restricted to the target base\nmodel's weights and subspace. To facilitate the transfer of LoRA parameters of\nthe source model to a target model, we employ the adapter only in the layers of\nthe target model that exhibit an acceptable level of subspace similarity. Our\nextensive experiments demonstrate the effectiveness of LoRA-X for text-to-image\ngeneration, including Stable Diffusion v1.5 and Stable Diffusion XL.\n","authors":["Farzad Farhadzadeh","Debasmit Das","Shubhankar Borse","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2501.16559v1.pdf","comment":"Accepted to ICLR 2025"},{"id":"http://arxiv.org/abs/2501.16551v1","updated":"2025-01-27T22:51:45Z","published":"2025-01-27T22:51:45Z","title":"PackDiT: Joint Human Motion and Text Generation via Mutual Prompting","summary":" Human motion generation has advanced markedly with the advent of diffusion\nmodels. Most recent studies have concentrated on generating motion sequences\nbased on text prompts, commonly referred to as text-to-motion generation.\nHowever, the bidirectional generation of motion and text, enabling tasks such\nas motion-to-text alongside text-to-motion, has been largely unexplored. This\ncapability is essential for aligning diverse modalities and supports\nunconditional generation. In this paper, we introduce PackDiT, the first\ndiffusion-based generative model capable of performing various tasks\nsimultaneously, including motion generation, motion prediction, text\ngeneration, text-to-motion, motion-to-text, and joint motion-text generation.\nOur core innovation leverages mutual blocks to integrate multiple diffusion\ntransformers (DiTs) across different modalities seamlessly. We train PackDiT on\nthe HumanML3D dataset, achieving state-of-the-art text-to-motion performance\nwith an FID score of 0.106, along with superior results in motion prediction\nand in-between tasks. Our experiments further demonstrate that diffusion models\nare effective for motion-to-text generation, achieving performance comparable\nto that of autoregressive models.\n","authors":["Zhongyu Jiang","Wenhao Chai","Zhuoran Zhou","Cheng-Yen Yang","Hsiang-Wei Huang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2501.16551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16550v1","updated":"2025-01-27T22:48:36Z","published":"2025-01-27T22:48:36Z","title":"PhysAnimator: Physics-Guided Generative Cartoon Animation","summary":" Creating hand-drawn animation sequences is labor-intensive and demands\nprofessional expertise. We introduce PhysAnimator, a novel approach for\ngenerating physically plausible meanwhile anime-stylized animation from static\nanime illustrations. Our method seamlessly integrates physics-based simulations\nwith data-driven generative models to produce dynamic and visually compelling\nanimations. To capture the fluidity and exaggeration characteristic of anime,\nwe perform image-space deformable body simulations on extracted mesh\ngeometries. 
We enhance artistic control by introducing customizable energy\nstrokes and incorporating rigging point support, enabling the creation of\ntailored animation effects such as wind interactions. Finally, we extract and\nwarp sketches from the simulation sequence, generating a texture-agnostic\nrepresentation, and employ a sketch-guided video diffusion model to synthesize\nhigh-quality animation frames. The resulting animations exhibit temporal\nconsistency and visual plausibility, demonstrating the effectiveness of our\nmethod in creating dynamic anime-style animations.\n","authors":["Tianyi Xie","Yiwei Zhao","Ying Jiang","Chenfanfu Jiang"],"pdf_url":"https://arxiv.org/pdf/2501.16550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14028v2","updated":"2025-01-27T22:03:44Z","published":"2024-09-21T06:08:23Z","title":"MSDet: Receptive Field Enhanced Multiscale Detection for Tiny Pulmonary\n Nodule","summary":" Pulmonary nodules are critical indicators for the early diagnosis of lung\ncancer, making their detection essential for timely treatment. However,\ntraditional CT imaging methods suffered from cumbersome procedures, low\ndetection rates, and poor localization accuracy. The subtle differences between\npulmonary nodules and surrounding tissues in complex lung CT images, combined\nwith repeated downsampling in feature extraction networks, often lead to missed\nor false detections of small nodules. Existing methods such as FPN, with its\nfixed feature fusion and limited receptive field, struggle to effectively\novercome these issues. To address these challenges, our paper proposed three\nkey contributions: Firstly, we proposed MSDet, a multiscale attention and\nreceptive field network for detecting tiny pulmonary nodules. Secondly, we\nproposed the extended receptive domain (ERD) strategy to capture richer\ncontextual information and reduce false positives caused by nodule occlusion.\nWe also proposed the position channel attention mechanism (PCAM) to optimize\nfeature learning and reduce multiscale detection errors, and designed the tiny\nobject detection block (TODB) to enhance the detection of tiny nodules. Lastly,\nwe conducted thorough experiments on the public LUNA16 dataset, achieving\nstate-of-the-art performance, with an mAP improvement of 8.8% over the previous\nstate-of-the-art method YOLOv8. These advancements significantly boosted\ndetection accuracy and reliability, providing a more effective solution for\nearly lung cancer diagnosis. The code will be available at\nhttps://github.com/CaiGuoHui123/MSDet\n","authors":["Guohui Cai","Ruicheng Zhang","Hongyang He","Zeyu Zhang","Daji Ergu","Yuanzhouhan Cao","Jinman Zhao","Binbin Hu","Zhinbin Liao","Yang Zhao","Ying Cai"],"pdf_url":"https://arxiv.org/pdf/2409.14028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16525v1","updated":"2025-01-27T21:50:12Z","published":"2025-01-27T21:50:12Z","title":"Multi-Objective Deep-Learning-based Biomechanical Deformable Image\n Registration with MOREA","summary":" When choosing a deformable image registration (DIR) approach for images with\nlarge deformations and content mismatch, the realism of found transformations\noften needs to be traded off against the required runtime. DIR approaches using\ndeep learning (DL) techniques have shown remarkable promise in instantly\npredicting a transformation. However, on difficult registration problems, the\nrealism of these transformations can fall short. 
DIR approaches using\nbiomechanical, finite element modeling (FEM) techniques can find more realistic\ntransformations, but tend to require much longer runtimes. This work proposes\nthe first hybrid approach to combine them, with the aim of getting the best of\nboth worlds. This hybrid approach, called DL-MOREA, combines a recently\nintroduced multi-objective DL-based DIR approach which leverages the VoxelMorph\nframework, called DL-MODIR, with MOREA, an evolutionary algorithm-based,\nmulti-objective DIR approach in which a FEM-like biomechanical mesh\ntransformation model is used. In our proposed hybrid approach, the DL results\nare used to smartly initialize MOREA, with the aim of more efficiently\noptimizing its mesh transformation model. We empirically compare DL-MOREA\nagainst its components, DL-MODIR and MOREA, on CT scan pairs capturing large\nbladder filling differences of 15 cervical cancer patients. While MOREA\nrequires a median runtime of 45 minutes, DL-MOREA can already find high-quality\ntransformations after 5 minutes. Compared to the DL-MODIR transformations, the\ntransformations found by DL-MOREA exhibit far less folding and improve or\npreserve the bladder contour distance error.\n","authors":["Georgios Andreadis","Eduard Ruiz Munné","Thomas H. W. Bäck","Peter A. N. Bosman","Tanja Alderliesten"],"pdf_url":"https://arxiv.org/pdf/2501.16525v1.pdf","comment":"Pre-print for the SPIE Medical Imaging: Image Processing Conference"},{"id":"http://arxiv.org/abs/2412.16050v4","updated":"2025-01-27T21:13:10Z","published":"2024-12-20T16:52:11Z","title":"Label-Efficient Data Augmentation with Video Diffusion Models for\n Guidewire Segmentation in Cardiac Fluoroscopy","summary":" The accurate segmentation of guidewires in interventional cardiac fluoroscopy\nvideos is crucial for computer-aided navigation tasks. Although deep learning\nmethods have demonstrated high accuracy and robustness in wire segmentation,\nthey require substantial annotated datasets for generalizability, underscoring\nthe need for extensive labeled data to enhance model performance. To address\nthis challenge, we propose the Segmentation-guided Frame-consistency Video\nDiffusion Model (SF-VD) to generate large collections of labeled fluoroscopy\nvideos, augmenting the training data for wire segmentation networks. SF-VD\nleverages videos with limited annotations by independently modeling scene\ndistribution and motion distribution. It first samples the scene distribution\nby generating 2D fluoroscopy images with wires positioned according to a\nspecified input mask, and then samples the motion distribution by progressively\ngenerating subsequent frames, ensuring frame-to-frame coherence through a\nframe-consistency strategy. A segmentation-guided mechanism further refines the\nprocess by adjusting wire contrast, ensuring a diverse range of visibility in\nthe synthesized image. Evaluation on a fluoroscopy dataset confirms the\nsuperior quality of the generated videos and shows significant improvements in\nguidewire segmentation.\n","authors":["Shaoyan Pan","Yikang Liu","Lin Zhao","Eric Z. 
Chen","Xiao Chen","Terrence Chen","Shanhui Sun"],"pdf_url":"https://arxiv.org/pdf/2412.16050v4.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2410.04492v5","updated":"2025-01-27T20:34:06Z","published":"2024-10-06T14:11:39Z","title":"Interpret Your Decision: Logical Reasoning Regularization for\n Generalization in Visual Classification","summary":" Vision models excel in image classification but struggle to generalize to\nunseen data, such as classifying images from unseen domains or discovering\nnovel categories. In this paper, we explore the relationship between logical\nreasoning and deep learning generalization in visual classification. A logical\nregularization termed L-Reg is derived which bridges a logical analysis\nframework to image classification. Our work reveals that L-Reg reduces the\ncomplexity of the model in terms of the feature distribution and classifier\nweights. Specifically, we unveil the interpretability brought by L-Reg, as it\nenables the model to extract the salient features, such as faces to persons,\nfor classification. Theoretical analysis and experiments demonstrate that L-Reg\nenhances generalization across various scenarios, including multi-domain\ngeneralization and generalized category discovery. In complex real-world\nscenarios where images span unknown classes and unseen domains, L-Reg\nconsistently improves generalization, highlighting its practical efficacy.\n","authors":["Zhaorui Tan","Xi Yang","Qiufeng Wang","Anh Nguyen","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2410.04492v5.pdf","comment":"Accepted by NeurIPS2024 as Spotlight"},{"id":"http://arxiv.org/abs/2501.16481v1","updated":"2025-01-27T20:28:01Z","published":"2025-01-27T20:28:01Z","title":"Generating customized prompts for Zero-Shot Rare Event Medical Image\n Classification using LLM","summary":" Rare events, due to their infrequent occurrences, do not have much data, and\nhence deep learning techniques fail in estimating the distribution for such\ndata. Open-vocabulary models represent an innovative approach to image\nclassification. Unlike traditional models, these models classify images into\nany set of categories specified with natural language prompts during inference.\nThese prompts usually comprise manually crafted templates (e.g., 'a photo of a\n{}') that are filled in with the names of each category. This paper introduces\na simple yet effective method for generating highly accurate and contextually\ndescriptive prompts containing discriminative characteristics. Rare event\ndetection, especially in medicine, is more challenging due to low inter-class\nand high intra-class variability. To address these, we propose a novel approach\nthat uses domain-specific expert knowledge on rare events to generate\ncustomized and contextually relevant prompts, which are then used by large\nlanguage models for image classification. 
Our zero-shot, privacy-preserving\nmethod enhances rare event classification without additional training,\noutperforming state-of-the-art techniques.\n","authors":["Payal Kamboj","Ayan Banerjee","Bin Xu","Sandeep Gupta"],"pdf_url":"https://arxiv.org/pdf/2501.16481v1.pdf","comment":"Accepted in IEEE ISBI, 2025"},{"id":"http://arxiv.org/abs/2202.13239v3","updated":"2025-01-27T20:09:00Z","published":"2022-02-26T22:27:36Z","title":"QOC: Quantum On-Chip Training with Parameter Shift and Gradient Pruning","summary":" Parameterized Quantum Circuits (PQC) are drawing increasing research interest\nthanks to its potential to achieve quantum advantages on near-term Noisy\nIntermediate Scale Quantum (NISQ) hardware. In order to achieve scalable PQC\nlearning, the training process needs to be offloaded to real quantum machines\ninstead of using exponential-cost classical simulators. One common approach to\nobtain PQC gradients is parameter shift whose cost scales linearly with the\nnumber of qubits. We present QOC, the first experimental demonstration of\npractical on-chip PQC training with parameter shift. Nevertheless, we find that\ndue to the significant quantum errors (noises) on real machines, gradients\nobtained from naive parameter shift have low fidelity and thus degrading the\ntraining accuracy. To this end, we further propose probabilistic gradient\npruning to firstly identify gradients with potentially large errors and then\nremove them. Specifically, small gradients have larger relative errors than\nlarge ones, thus having a higher probability to be pruned. We perform extensive\nexperiments with the Quantum Neural Network (QNN) benchmarks on 5\nclassification tasks using 5 real quantum machines. The results demonstrate\nthat our on-chip training achieves over 90% and 60% accuracy for 2-class and\n4-class image classification tasks. The probabilistic gradient pruning brings\nup to 7% PQC accuracy improvements over no pruning. Overall, we successfully\nobtain similar on-chip training accuracy compared with noise-free simulation\nbut have much better training scalability. The QOC code is available in the\nTorchQuantum library.\n","authors":["Hanrui Wang","Zirui Li","Jiaqi Gu","Yongshan Ding","David Z. Pan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2202.13239v3.pdf","comment":"Published as a conference paper in DAC 2022; 7 pages, 8 figures;\n open-source at https://github.com/mit-han-lab/torchquantum"},{"id":"http://arxiv.org/abs/2406.17720v2","updated":"2025-01-27T20:06:18Z","published":"2024-06-25T17:09:54Z","title":"BioTrove: A Large Curated Image Dataset Enabling AI for Biodiversity","summary":" We introduce BioTrove, the largest publicly accessible dataset designed to\nadvance AI applications in biodiversity. Curated from the iNaturalist platform\nand vetted to include only research-grade data, BioTrove contains 161.9 million\nimages, offering unprecedented scale and diversity from three primary kingdoms:\nAnimalia (\"animals\"), Fungi (\"fungi\"), and Plantae (\"plants\"), spanning\napproximately 366.6K species. 
Each image is annotated with scientific names,\ntaxonomic hierarchies, and common names, providing rich metadata to support\naccurate AI model development across diverse species and ecosystems.\n We demonstrate the value of BioTrove by releasing a suite of CLIP models\ntrained using a subset of 40 million captioned images, known as BioTrove-Train.\nThis subset focuses on seven categories within the dataset that are\nunderrepresented in standard image recognition models, selected for their\ncritical role in biodiversity and agriculture: Aves (\"birds\"), Arachnida\n(\"spiders/ticks/mites\"), Insecta (\"insects\"), Plantae (\"plants\"), Fungi\n(\"fungi\"), Mollusca (\"snails\"), and Reptilia (\"snakes/lizards\"). To support\nrigorous assessment, we introduce several new benchmarks and report model\naccuracy for zero-shot learning across life stages, rare species, confounding\nspecies, and multiple taxonomic levels.\n We anticipate that BioTrove will spur the development of AI models capable of\nsupporting digital tools for pest control, crop monitoring, biodiversity\nassessment, and environmental conservation. These advancements are crucial for\nensuring food security, preserving ecosystems, and mitigating the impacts of\nclimate change. BioTrove is publicly available, easily accessible, and ready\nfor immediate use.\n","authors":["Chih-Hsuan Yang","Benjamin Feuer","Zaki Jubery","Zi K. Deng","Andre Nakkab","Md Zahid Hasan","Shivani Chiranjeevi","Kelly Marshall","Nirmal Baishnab","Asheesh K Singh","Arti Singh","Soumik Sarkar","Nirav Merchant","Chinmay Hegde","Baskar Ganapathysubramanian"],"pdf_url":"https://arxiv.org/pdf/2406.17720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16469v1","updated":"2025-01-27T20:02:53Z","published":"2025-01-27T20:02:53Z","title":"Object Detection for Medical Image Analysis: Insights from the RT-DETR\n Model","summary":" Deep learning has emerged as a transformative approach for solving complex\npattern recognition and object detection challenges. This paper focuses on the\napplication of a novel detection framework based on the RT-DETR model for\nanalyzing intricate image data, particularly in areas such as diabetic\nretinopathy detection. Diabetic retinopathy, a leading cause of vision loss\nglobally, requires accurate and efficient image analysis to identify\nearly-stage lesions. The proposed RT-DETR model, built on a Transformer-based\narchitecture, excels at processing high-dimensional and complex visual data\nwith enhanced robustness and accuracy. Comparative evaluations with models such\nas YOLOv5, YOLOv8, SSD, and DETR demonstrate that RT-DETR achieves superior\nperformance across precision, recall, mAP50, and mAP50-95 metrics, particularly\nin detecting small-scale objects and densely packed targets. This study\nunderscores the potential of Transformer-based models like RT-DETR for\nadvancing object detection tasks, offering promising applications in medical\nimaging and beyond.\n","authors":["Weijie He","Yuwei Zhang","Ting Xu","Tai An","Yingbin Liang","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.16469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16467v1","updated":"2025-01-27T20:02:12Z","published":"2025-01-27T20:02:12Z","title":"Cross-Domain Semantic Segmentation with Large Language Model-Assisted\n Descriptor Generation","summary":" Semantic segmentation plays a crucial role in enabling machines to understand\nand interpret visual scenes at a pixel level. 
While traditional segmentation\nmethods have achieved remarkable success, their generalization to diverse\nscenes and unseen object categories remains limited. Recent advancements in\nlarge language models (LLMs) offer a promising avenue for bridging visual and\ntextual modalities, providing a deeper understanding of semantic relationships.\nIn this paper, we propose LangSeg, a novel LLM-guided semantic segmentation\nmethod that leverages context-sensitive, fine-grained subclass descriptors\ngenerated by LLMs. Our framework integrates these descriptors with a\npre-trained Vision Transformer (ViT) to achieve superior segmentation\nperformance without extensive model retraining. We evaluate LangSeg on two\nchallenging datasets, ADE20K and COCO-Stuff, where it outperforms\nstate-of-the-art models, achieving up to a 6.1% improvement in mean\nIntersection over Union (mIoU). Additionally, we conduct a comprehensive\nablation study and human evaluation to validate the effectiveness of our method\nin real-world scenarios. The results demonstrate that LangSeg not only excels\nin semantic understanding and contextual alignment but also provides a flexible\nand efficient framework for language-guided segmentation tasks. This approach\nopens up new possibilities for interactive and domain-specific segmentation\napplications.\n","authors":["Philip Hughes","Larry Burns","Luke Adams"],"pdf_url":"https://arxiv.org/pdf/2501.16467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.12477v2","updated":"2025-01-27T19:53:35Z","published":"2025-01-21T19:59:22Z","title":"Slot-BERT: Self-supervised Object Discovery in Surgical Video","summary":" Object-centric slot attention is a powerful framework for unsupervised\nlearning of structured and explainable representations that can support\nreasoning about objects and actions, including in surgical videos. While\nconventional object-centric methods for videos leverage recurrent processing to\nachieve efficiency, they often struggle with maintaining long-range temporal\ncoherence required for long videos in surgical applications. On the other hand,\nfully parallel processing of entire videos enhances temporal consistency but\nintroduces significant computational overhead, making it impractical for\nimplementation on hardware in medical facilities. We present Slot-BERT, a\nbidirectional long-range model that learns object-centric representations in a\nlatent space while ensuring robust temporal coherence. Slot-BERT scales object\ndiscovery seamlessly to long videos of unconstrained lengths. A novel slot\ncontrastive loss further reduces redundancy and improves the representation\ndisentanglement by enhancing slot orthogonality. We evaluate Slot-BERT on\nreal-world surgical video datasets from abdominal, cholecystectomy, and\nthoracic procedures. Our method surpasses state-of-the-art object-centric\napproaches under unsupervised training achieving superior performance across\ndiverse domains. We also demonstrate efficient zero-shot domain adaptation to\ndata from diverse surgical specialties and databases.\n","authors":["Guiqiu Liao","Matjaz Jogan","Marcel Hussing","Kenta Nakahashi","Kazuhiro Yasufuku","Amin Madani","Eric Eaton","Daniel A. 
Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2501.12477v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16458v1","updated":"2025-01-27T19:37:18Z","published":"2025-01-27T19:37:18Z","title":"BiFold: Bimanual Cloth Folding with Language Guidance","summary":" Cloth folding is a complex task due to the inevitable self-occlusions of\nclothes, their complicated dynamics, and the disparate materials, geometries,\nand textures that garments can have. In this work, we learn folding actions\nconditioned on text commands. Translating high-level, abstract instructions\ninto precise robotic actions requires sophisticated language understanding and\nmanipulation capabilities. To do that, we leverage a pre-trained\nvision-language model and repurpose it to predict manipulation actions. Our\nmodel, BiFold, can take context into account and achieves state-of-the-art\nperformance on an existing language-conditioned folding benchmark. Given the\nlack of annotated bimanual folding data, we devise a procedure to automatically\nparse actions of a simulated dataset and tag them with aligned text\ninstructions. BiFold attains the best performance on our dataset and can\ntransfer to new instructions, garments, and environments.\n","authors":["Oriol Barbany","Adrià Colomé","Carme Torras"],"pdf_url":"https://arxiv.org/pdf/2501.16458v1.pdf","comment":"Accepted at ICRA 2025"},{"id":"http://arxiv.org/abs/2501.16443v1","updated":"2025-01-27T19:07:06Z","published":"2025-01-27T19:07:06Z","title":"Objects matter: object-centric world models improve reinforcement\n learning in visually complex environments","summary":" Deep reinforcement learning has achieved remarkable success in learning\ncontrol policies from pixels across a wide range of tasks, yet its application\nremains hindered by low sample efficiency, requiring significantly more\nenvironment interactions than humans to reach comparable performance.\nModel-based reinforcement learning (MBRL) offers a solution by leveraging\nlearnt world models to generate simulated experience, thereby improving sample\nefficiency. However, in visually complex environments, small or dynamic\nelements can be critical for decision-making. Yet, traditional MBRL methods in\npixel-based environments typically rely on auto-encoding with an $L_2$ loss,\nwhich is dominated by large areas and often fails to capture decision-relevant\ndetails. To address these limitations, we propose an object-centric MBRL\npipeline, which integrates recent advances in computer vision to allow agents\nto focus on key decision-related elements. 
Our approach consists of four main\nsteps: (1) annotating key objects related to rewards and goals with\nsegmentation masks, (2) extracting object features using a pre-trained, frozen\nfoundation vision model, (3) incorporating these object features with the raw\nobservations to predict environmental dynamics, and (4) training the policy\nusing imagined trajectories generated by this object-centric world model.\nBuilding on the efficient MBRL algorithm STORM, we call this pipeline OC-STORM.\nWe demonstrate OC-STORM's practical value in overcoming the limitations of\nconventional MBRL approaches on both Atari games and the visually complex game\nHollow Knight.\n","authors":["Weipu Zhang","Adam Jelley","Trevor McInroe","Amos Storkey"],"pdf_url":"https://arxiv.org/pdf/2501.16443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.04012v2","updated":"2025-01-27T19:02:05Z","published":"2024-10-05T03:02:47Z","title":"JAM: A Comprehensive Model for Age Estimation, Verification, and\n Comparability","summary":" This paper introduces a comprehensive model for age estimation, verification,\nand comparability, offering a comprehensive solution for a wide range of\napplications. It employs advanced learning techniques to understand age\ndistribution and uses confidence scores to create probabilistic age ranges,\nenhancing its ability to handle ambiguous cases. The model has been tested on\nboth proprietary and public datasets and compared against one of the\ntop-performing models in the field. Additionally, it has recently been\nevaluated by NIST as part of the FATE challenge, achieving top places in many\ncategories.\n","authors":["François David","Alexey A. Novikov","Ruslan Parkhomenko","Artem Voronin","Alix Melchy"],"pdf_url":"https://arxiv.org/pdf/2410.04012v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16411v1","updated":"2025-01-27T18:59:58Z","published":"2025-01-27T18:59:58Z","title":"PhysBench: Benchmarking and Enhancing Vision-Language Models for\n Physical World Understanding","summary":" Understanding the physical world is a fundamental challenge in embodied AI,\ncritical for enabling agents to perform complex tasks and operate safely in\nreal-world environments. While Vision-Language Models (VLMs) have shown great\npromise in reasoning and task planning for embodied agents, their ability to\ncomprehend physical phenomena remains extremely limited. To close this gap, we\nintroduce PhysBench, a comprehensive benchmark designed to evaluate VLMs'\nphysical world understanding capability across a diverse set of tasks.\nPhysBench contains 100,000 entries of interleaved video-image-text data,\ncategorized into four major domains: physical object properties, physical\nobject relationships, physical scene understanding, and physics-based dynamics,\nfurther divided into 19 subclasses and 8 distinct capability dimensions. Our\nextensive experiments, conducted on 75 representative VLMs, reveal that while\nthese models excel in common-sense reasoning, they struggle with understanding\nthe physical world -- likely due to the absence of physical knowledge in their\ntraining data and the lack of embedded physical priors. To tackle the\nshortfall, we introduce PhysAgent, a novel framework that combines the\ngeneralization strengths of VLMs with the specialized expertise of vision\nmodels, significantly enhancing VLMs' physical understanding across a variety\nof tasks, including an 18.4\\% improvement on GPT-4o. 
Furthermore, our results\ndemonstrate that enhancing VLMs' physical world understanding capabilities can\nhelp embodied agents such as MOKA. We believe that PhysBench and PhysAgent\noffer valuable insights and contribute to bridging the gap between VLMs and\nphysical world understanding.\n","authors":["Wei Chow","Jiageng Mao","Boyi Li","Daniel Seita","Vitor Guizilini","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2501.16411v1.pdf","comment":"ICLR 2025. Project page: https://physbench.github.io/; Dataset:\n https://huggingface.co/datasets/USC-GVL/PhysBench;"},{"id":"http://arxiv.org/abs/2501.16410v1","updated":"2025-01-27T18:57:19Z","published":"2025-01-27T18:57:19Z","title":"DynAlign: Unsupervised Dynamic Taxonomy Alignment for Cross-Domain\n Segmentation","summary":" Current unsupervised domain adaptation (UDA) methods for semantic\nsegmentation typically assume identical class labels between the source and\ntarget domains. This assumption ignores the label-level domain gap, which is\ncommon in real-world scenarios, thus limiting their ability to identify\nfiner-grained or novel categories without requiring extensive manual\nannotation. A promising direction to address this limitation lies in recent\nadvancements in foundation models, which exhibit strong generalization\nabilities due to their rich prior knowledge. However, these models often\nstruggle with domain-specific nuances and underrepresented fine-grained\ncategories.\n To address these challenges, we introduce DynAlign, a framework that\nintegrates UDA with foundation models to bridge both the image-level and\nlabel-level domain gaps. Our approach leverages prior semantic knowledge to\nalign source categories with target categories that can be novel, more\nfine-grained, or named differently (e.g., vehicle to {car, truck, bus}).\nFoundation models are then employed for precise segmentation and category\nreassignment. To further enhance accuracy, we propose a knowledge fusion\napproach that dynamically adapts to varying scene contexts. DynAlign generates\naccurate predictions in a new target label space without requiring any manual\nannotations, allowing seamless adaptation to new taxonomies through either\nmodel retraining or direct inference.\n Experiments on the street scene semantic segmentation benchmarks GTA to\nMapillary Vistas and GTA to IDD validate the effectiveness of our approach,\nachieving a significant improvement over existing methods. Our code will be\npublicly available.\n","authors":["Han Sun","Rui Gong","Ismail Nejjar","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2501.16410v1.pdf","comment":null}]},"2025-01-26T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.15659v1","updated":"2025-01-26T19:43:41Z","published":"2025-01-26T19:43:41Z","title":"AirIO: Learning Inertial Odometry with Enhanced IMU Feature\n Observability","summary":" Inertial odometry (IO) using only Inertial Measurement Units (IMUs) offers a\nlightweight and cost-effective solution for Unmanned Aerial Vehicle (UAV)\napplications, yet existing learning-based IO models often fail to generalize to\nUAVs due to the highly dynamic and non-linear-flight patterns that differ from\npedestrian motion. In this work, we identify that the conventional practice of\ntransforming raw IMU data to global coordinates undermines the observability of\ncritical kinematic information in UAVs. By preserving the body-frame\nrepresentation, our method achieves substantial performance improvements, with\na 66.7% average increase in accuracy across three datasets. 
Furthermore,\nexplicitly encoding attitude information into the motion network results in an\nadditional 23.8% improvement over prior results. Combined with a data-driven\nIMU correction model (AirIMU) and an uncertainty-aware Extended Kalman Filter\n(EKF), our approach ensures robust state estimation under aggressive UAV\nmaneuvers without relying on external sensors or control inputs. Notably, our\nmethod also demonstrates strong generalizability to unseen data not included in\nthe training set, underscoring its potential for real-world UAV applications.\n","authors":["Yuheng Qiu","Can Xu","Yutian Chen","Shibo Zhao","Junyi Geng","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2501.15659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15618v1","updated":"2025-01-26T17:54:43Z","published":"2025-01-26T17:54:43Z","title":"Your Learned Constraint is Secretly a Backward Reachable Tube","summary":" Inverse Constraint Learning (ICL) is the problem of inferring constraints\nfrom safe (i.e., constraint-satisfying) demonstrations. The hope is that these\ninferred constraints can then be used downstream to search for safe policies\nfor new tasks and, potentially, under different dynamics. Our paper explores\nthe question of what mathematical entity ICL recovers. Somewhat surprisingly,\nwe show that both in theory and in practice, ICL recovers the set of states\nwhere failure is inevitable, rather than the set of states where failure has\nalready happened. In the language of safe control, this means we recover a\nbackwards reachable tube (BRT) rather than a failure set. In contrast to the\nfailure set, the BRT depends on the dynamics of the data collection system. We\ndiscuss the implications of the dynamics-conditionedness of the recovered\nconstraint on both the sample-efficiency of policy search and the\ntransferability of learned constraints.\n","authors":["Mohamad Qadri","Gokul Swamy","Jonathan Francis","Michael Kaess","Andrea Bajcsy"],"pdf_url":"https://arxiv.org/pdf/2501.15618v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.15564v1","updated":"2025-01-26T15:49:50Z","published":"2025-01-26T15:49:50Z","title":"Diffusion-Based Planning for Autonomous Driving with Flexible Guidance","summary":" Achieving human-like driving behaviors in complex open-world environments is\na critical challenge in autonomous driving. Contemporary learning-based\nplanning approaches such as imitation learning methods often struggle to\nbalance competing objectives and lack of safety assurance,due to limited\nadaptability and inadequacy in learning complex multi-modal behaviors commonly\nexhibited in human planning, not to mention their strong reliance on the\nfallback strategy with predefined rules. We propose a novel transformer-based\nDiffusion Planner for closed-loop planning, which can effectively model\nmulti-modal driving behavior and ensure trajectory quality without any\nrule-based refinement. Our model supports joint modeling of both prediction and\nplanning tasks under the same architecture, enabling cooperative behaviors\nbetween vehicles. Moreover, by learning the gradient of the trajectory score\nfunction and employing a flexible classifier guidance mechanism, Diffusion\nPlanner effectively achieves safe and adaptable planning behaviors. 
Evaluations\non the large-scale real-world autonomous planning benchmark nuPlan and our\nnewly collected 200-hour delivery-vehicle driving dataset demonstrate that\nDiffusion Planner achieves state-of-the-art closed-loop performance with robust\ntransferability in diverse driving styles.\n","authors":["Yinan Zheng","Ruiming Liang","Kexin Zheng","Jinliang Zheng","Liyuan Mao","Jianxiong Li","Weihao Gu","Rui Ai","Shengbo Eben Li","Xianyuan Zhan","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2501.15564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16931v2","updated":"2025-01-26T15:27:59Z","published":"2024-11-25T21:00:46Z","title":"Performance Assessment of Lidar Odometry Frameworks: A Case Study at the\n Australian Botanic Garden Mount Annan","summary":" Autonomous vehicles are being tested in diverse environments worldwide.\nHowever, a notable gap exists in evaluating datasets representing natural,\nunstructured environments such as forests or gardens. To address this, we\npresent a study on localisation at the Australian Botanic Garden Mount Annan.\nThis area encompasses open grassy areas, paved pathways, and densely vegetated\nsections with trees and other objects. The dataset was recorded using a\n128-beam LiDAR sensor and GPS and IMU readings to track the ego-vehicle. This\npaper evaluates the performance of two state-of-the-art LiDARinertial odometry\nframeworks, COIN-LIO and LIO-SAM, on this dataset. We analyse trajectory\nestimates in both horizontal and vertical dimensions and assess relative\ntranslation and yaw errors over varying distances. Our findings reveal that\nwhile both frameworks perform adequately in the vertical plane, COINLIO\ndemonstrates superior accuracy in the horizontal plane, particularly over\nextended trajectories. In contrast, LIO-SAM shows increased drift and yaw\nerrors over longer distances.\n","authors":["Mohamed Mourad Ouazghire","Julie Stephany Berrio","Mao Shan","Stewart Worrall"],"pdf_url":"https://arxiv.org/pdf/2411.16931v2.pdf","comment":"The 2024 Australasian Conference on Robotics and Automation (ACRA\n 2024)"},{"id":"http://arxiv.org/abs/2311.01248v5","updated":"2025-01-26T15:03:06Z","published":"2023-11-02T14:02:42Z","title":"Multimodal and Force-Matched Imitation Learning with a See-Through\n Visuotactile Sensor","summary":" Contact-rich tasks continue to present many challenges for robotic\nmanipulation. In this work, we leverage a multimodal visuotactile sensor within\nthe framework of imitation learning (IL) to perform contact-rich tasks that\ninvolve relative motion (e.g., slipping and sliding) between the end-effector\nand the manipulated object. We introduce two algorithmic contributions, tactile\nforce matching and learned mode switching, as complimentary methods for\nimproving IL. Tactile force matching enhances kinesthetic teaching by reading\napproximate forces during the demonstration and generating an adapted robot\ntrajectory that recreates the recorded forces. Learned mode switching uses IL\nto couple visual and tactile sensor modes with the learned motion policy,\nsimplifying the transition from reaching to contacting. We perform robotic\nmanipulation experiments on four door-opening tasks with a variety of\nobservation and algorithm configurations to study the utility of multimodal\nvisuotactile sensing and our proposed improvements. 
Our results show that the\ninclusion of force matching raises average policy success rates by 62.5%,\nvisuotactile mode switching by 30.3%, and visuotactile data as a policy input\nby 42.5%, emphasizing the value of see-through tactile sensing for IL, both for\ndata collection to allow force matching, and for policy execution to enable\naccurate task feedback. Project site: https://papers.starslab.ca/sts-il/\n","authors":["Trevor Ablett","Oliver Limoyo","Adam Sigal","Affan Jilani","Jonathan Kelly","Kaleem Siddiqi","Francois Hogan","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2311.01248v5.pdf","comment":"14 pages, 22 figures"},{"id":"http://arxiv.org/abs/2501.15505v1","updated":"2025-01-26T12:34:48Z","published":"2025-01-26T12:34:48Z","title":"Unveiling the Potential of iMarkers: Invisible Fiducial Markers for\n Advanced Robotics","summary":" Fiducial markers are widely used in various robotics tasks, facilitating\nenhanced navigation, object recognition, and scene understanding. Despite their\nadvantages for robots and Augmented Reality (AR) applications, they often\ndisrupt the visual aesthetics of environments because they are visible to\nhumans, making them unsuitable for non-intrusive use cases. To address this\ngap, this paper presents \"iMarkers\"-innovative, unobtrusive fiducial markers\ndetectable exclusively by robots equipped with specialized sensors. These\nmarkers offer high flexibility in production, allowing customization of their\nvisibility range and encoding algorithms to suit various demands. The paper\nalso introduces the hardware designs and software algorithms developed for\ndetecting iMarkers, highlighting their adaptability and robustness in the\ndetection and recognition stages. Various evaluations have demonstrated the\neffectiveness of iMarkers compared to conventional (printed) and blended\nfiducial markers and confirmed their applicability in diverse robotics\nscenarios.\n","authors":["Ali Tourani","Deniz Isinsu Avsar","Hriday Bavle","Jose Luis Sanchez-Lopez","Jan Lagerwall","Holger Voos"],"pdf_url":"https://arxiv.org/pdf/2501.15505v1.pdf","comment":"12 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.18251v3","updated":"2025-01-26T07:43:30Z","published":"2024-05-28T15:02:09Z","title":"Sensor-Based Distributionally Robust Control for Safe Robot Navigation\n in Dynamic Environments","summary":" We introduce a novel method for mobile robot navigation in dynamic, unknown\nenvironments, leveraging onboard sensing and distributionally robust\noptimization to impose probabilistic safety constraints. Our method introduces\na distributionally robust control barrier function (DR-CBF) that directly\nintegrates noisy sensor measurements and state estimates to define safety\nconstraints. This approach is applicable to a wide range of control-affine\ndynamics, generalizable to robots with complex geometries, and capable of\noperating at real-time control frequencies. Coupled with a control Lyapunov\nfunction (CLF) for path following, the proposed CLF-DR-CBF control synthesis\nmethod achieves safe, robust, and efficient navigation in challenging\nenvironments. 
We demonstrate the effectiveness and robustness of our approach\nfor safe autonomous navigation under uncertainty in simulations and real-world\nexperiments with differential-drive robots.\n","authors":["Kehan Long","Yinzhuang Yi","Zhirui Dai","Sylvia Herbert","Jorge Cortés","Nikolay Atanasov"],"pdf_url":"https://arxiv.org/pdf/2405.18251v3.pdf","comment":"Project page: https://existentialrobotics.org/DRO_Safe_Navigation"},{"id":"http://arxiv.org/abs/2501.15426v1","updated":"2025-01-26T07:06:31Z","published":"2025-01-26T07:06:31Z","title":"FAVbot: An Autonomous Target Tracking Micro-Robot with Frequency\n Actuation Control","summary":" Robotic autonomy at centimeter scale requires compact and\nminiaturization-friendly actuation integrated with sensing and neural network\nprocessing assembly within a tiny form factor. Applications of such systems\nhave witnessed significant advancements in recent years in fields such as\nhealthcare, manufacturing, and post-disaster rescue. The system design at this\nscale puts stringent constraints on power consumption for both the sensory\nfront-end and actuation back-end and the weight of the electronic assembly for\nrobust operation. In this paper, we introduce FAVbot, the first autonomous\nmobile micro-robotic system integrated with a novel actuation mechanism and\nconvolutional neural network (CNN) based computer vision - all integrated\nwithin a compact 3-cm form factor. The novel actuation mechanism utilizes\nmechanical resonance phenomenon to achieve frequency-controlled steering with a\nsingle piezoelectric actuator. Experimental results demonstrate the\neffectiveness of FAVbot's frequency-controlled actuation, which offers a\ndiverse selection of resonance modes with different motion characteristics. The\nactuation system is complemented with the vision front-end where a camera along\nwith a microcontroller supports object detection for closed-loop control and\nautonomous target tracking. This enables adaptive navigation in dynamic\nenvironments. This work contributes to the evolving landscape of neural\nnetwork-enabled micro-robotic systems showing the smallest autonomous robot\nbuilt using controllable multi-directional single-actuator mechanism.\n","authors":["Zhijian Hao","Ashwin Lele","Yan Fang","Arijit Raychowdhury","Azadeh Ansari"],"pdf_url":"https://arxiv.org/pdf/2501.15426v1.pdf","comment":"This paper is under consideration for journal publication. Authors\n reserve the right to transfer copyright without notice"},{"id":"http://arxiv.org/abs/2212.00398v2","updated":"2025-01-26T02:02:40Z","published":"2022-12-01T09:59:53Z","title":"Distributed Model Predictive Covariance Steering","summary":" This paper proposes Distributed Model Predictive Covariance Steering (DiMPCS)\nfor multi-agent control under stochastic uncertainty. The scope of our approach\nis to blend covariance steering theory, distributed optimization and model\npredictive control (MPC) into a single framework that is safe, scalable and\ndecentralized. Initially, we pose a problem formulation that uses the\nWasserstein distance to steer the state distributions of a multi-agent system\nto desired targets, and probabilistic constraints to ensure safety. We then\ntransform this problem into a finite-dimensional optimization one by utilizing\na disturbance feedback policy parametrization for covariance steering and a\ntractable approximation of the safety constraints. 
To solve the latter problem,\nwe derive a decentralized consensus-based algorithm using the Alternating\nDirection Method of Multipliers. This method is then extended to a receding\nhorizon form, which yields the proposed DiMPCS algorithm. Simulation\nexperiments on a variety of multi-robot tasks with up to hundreds of robots\ndemonstrate the effectiveness of DiMPCS. The superior scalability and\nperformance of the proposed method is also highlighted through a comparison\nagainst related stochastic MPC approaches. Finally, hardware results on a\nmulti-robot platform also verify the applicability of DiMPCS on real systems. A\nvideo with all results is available in https://youtu.be/tzWqOzuj2kQ.\n","authors":["Augustinos D. Saravanos","Isin M. Balci","Efstathios Bakolas","Evangelos A. Theodorou"],"pdf_url":"https://arxiv.org/pdf/2212.00398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16389v1","updated":"2025-01-26T00:27:04Z","published":"2025-01-26T00:27:04Z","title":"Bridging the Sim2Real Gap: Vision Encoder Pre-Training for Visuomotor\n Policy Transfer","summary":" Simulation offers a scalable and efficient alternative to real-world data\ncollection for learning visuomotor robotic policies. However, the\nsimulation-to-reality, or \"Sim2Real\" distribution shift -- introduced by\nemploying simulation-trained policies in real-world environments -- frequently\nprevents successful policy transfer. This study explores the potential of using\nlarge-scale pre-training of vision encoders to address the Sim2Real gap. We\nexamine a diverse collection of encoders, evaluating their ability to (1)\nextract features necessary for robot control while (2) remaining invariant to\ntask-irrelevant environmental variations. We quantitatively measure the\nencoder's feature extraction capabilities through linear probing and its domain\ninvariance by computing distances between simulation and real-world embedding\ncentroids. Additional qualitative insights are provided through t-SNE plots and\nGradCAM saliency maps. Findings suggest that encoders pre-trained on\nmanipulation-specific datasets generally outperform those trained on generic\ndatasets in bridging the Sim2Real gap.\nhttps://github.com/yyardi/Bridging-the-Sim2Real-Gap\n","authors":["Samuel Biruduganti","Yash Yardi","Lars Ankile"],"pdf_url":"https://arxiv.org/pdf/2501.16389v1.pdf","comment":"9 pages, 10 figures, view GitHub for all appendix figures from the\n study"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2412.20631v2","updated":"2025-01-26T23:16:36Z","published":"2024-12-30T00:40:35Z","title":"Slow Perception: Let's Perceive Geometric Figures Step-by-step","summary":" Recently, \"visual o1\" began to enter people's vision, with expectations that\nthis slow-thinking design can solve visual reasoning tasks, especially\ngeometric math problems. However, the reality is that current LVLMs (Large\nVision Language Models) can hardly even accurately copy a geometric figure, let\nalone truly understand the complex inherent logic and spatial relationships\nwithin geometric shapes. We believe accurate copying (strong perception) is the\nfirst step to visual o1. Accordingly, we introduce the concept of \"slow\nperception\" (SP), which guides the model to gradually perceive basic point-line\ncombinations, as our humans, reconstruct complex geometric structures\nprogressively. There are two-fold stages in SP: a) perception decomposition.\nPerception is not instantaneous. 
In this stage, complex geometric figures are\nbroken down into basic simple units to unify geometry representation. b)\nperception flow, which acknowledges that accurately tracing a line is not an\neasy task. This stage aims to avoid \"long visual jumps\" in regressing line\nsegments by using a proposed \"perceptual ruler\" to trace each line\nstroke-by-stroke. Surprisingly, such a human-like perception manner enjoys an\ninference time scaling law -- the slower, the better. Researchers strive to\nspeed up the model's perception in the past, but we slow it down again,\nallowing the model to read the image step-by-step and carefully.\n","authors":["Haoran Wei","Youyang Yin","Yumeng Li","Jia Wang","Liang Zhao","Jianjian Sun","Zheng Ge","Xiangyu Zhang","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2412.20631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10615v2","updated":"2025-01-26T21:43:53Z","published":"2025-01-18T00:35:41Z","title":"Hierarchical LoG Bayesian Neural Network for Enhanced Aorta Segmentation","summary":" Accurate segmentation of the aorta and its associated arch branches is\ncrucial for diagnosing aortic diseases. While deep learning techniques have\nsignificantly improved aorta segmentation, they remain challenging due to the\nintricate multiscale structure and the complexity of the surrounding tissues.\nThis paper presents a novel approach for enhancing aorta segmentation using a\nBayesian neural network-based hierarchical Laplacian of Gaussian (LoG) model.\nOur model consists of a 3D U-Net stream and a hierarchical LoG stream: the\nformer provides an initial aorta segmentation, and the latter enhances blood\nvessel detection across varying scales by learning suitable LoG kernels,\nenabling self-adaptive handling of different parts of the aorta vessels with\nsignificant scale differences. We employ a Bayesian method to parameterize the\nLoG stream and provide confidence intervals for the segmentation results,\nensuring robustness and reliability of the prediction for vascular medical\nimage analysts. Experimental results show that our model can accurately segment\nmain and supra-aortic vessels, yielding at least a 3% gain in the Dice\ncoefficient over state-of-the-art methods across multiple volumes drawn from\ntwo aorta datasets, and can provide reliable confidence intervals for different\nparts of the aorta. The code is available at https://github.com/adlsn/LoGBNet.\n","authors":["Delin An","Pan Du","Pengfei Gu","Jian-Xun Wang","Chaoli Wang"],"pdf_url":"https://arxiv.org/pdf/2501.10615v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03017v2","updated":"2025-01-26T21:18:41Z","published":"2023-11-06T10:49:12Z","title":"COLA: COarse-LAbel multi-source LiDAR semantic segmentation for\n autonomous driving","summary":" LiDAR semantic segmentation for autonomous driving has been a growing field\nof interest in recent years. Datasets and methods have appeared and expanded\nvery quickly, but methods have not been updated to exploit this new data\navailability and rely on the same classical datasets. Different ways of\nperforming LIDAR semantic segmentation training and inference can be divided\ninto several subfields, which include the following: domain generalization,\nsource-to-source segmentation, and pre-training. In this work, we aim to\nimprove results in all of these subfields with the novel approach of\nmulti-source training. Multi-source training relies on the availability of\nvarious datasets at training time. 
To overcome the common obstacles in\nmulti-source training, we introduce the coarse labels and call the newly\ncreated multi-source dataset COLA. We propose three applications of this new\ndataset that display systematic improvement over single-source strategies:\nCOLA-DG for domain generalization (+10%), COLA-S2S for source-to-source\nsegmentation (+5.3%), and COLA-PT for pre-training (+12%). We demonstrate that\nmulti-source approaches bring systematic improvement over single-source\napproaches.\n","authors":["Jules Sanchez","Jean-Emmanuel Deschaud","François Goulette"],"pdf_url":"https://arxiv.org/pdf/2311.03017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15666v1","updated":"2025-01-26T20:23:44Z","published":"2025-01-26T20:23:44Z","title":"MimicGait: A Model Agnostic approach for Occluded Gait Recognition using\n Correlational Knowledge Distillation","summary":" Gait recognition is an important biometric technique over large distances.\nState-of-the-art gait recognition systems perform very well in controlled\nenvironments at close range. Recently, there has been an increased interest in\ngait recognition in the wild prompted by the collection of outdoor, more\nchallenging datasets containing variations in terms of illumination, pitch\nangles, and distances. An important problem in these environments is that of\nocclusion, where the subject is partially blocked from camera view. While\nimportant, this problem has received little attention. Thus, we propose\nMimicGait, a model-agnostic approach for gait recognition in the presence of\nocclusions. We train the network using a multi-instance correlational\ndistillation loss to capture both inter-sequence and intra-sequence\ncorrelations in the occluded gait patterns of a subject, utilizing an auxiliary\nVisibility Estimation Network to guide the training of the proposed mimic\nnetwork. We demonstrate the effectiveness of our approach on challenging\nreal-world datasets like GREW, Gait3D and BRIAR. We release the code in\nhttps://github.com/Ayush-00/mimicgait.\n","authors":["Ayush Gupta","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2501.15666v1.pdf","comment":"Accepted to WACV 2025 as Poster"},{"id":"http://arxiv.org/abs/2501.15660v1","updated":"2025-01-26T19:46:49Z","published":"2025-01-26T19:46:49Z","title":"Marker Track: Accurate Fiducial Marker Tracking for Evaluation of\n Residual Motions During Breath-Hold Radiotherapy","summary":" Fiducial marker positions in projection image of cone-beam computed\ntomography (CBCT) scans have been studied to evaluate daily residual motion\nduring breath-hold radiation therapy. Fiducial marker migration posed\nchallenges in accurately locating markers, prompting the development of a novel\nalgorithm that reconstructs volumetric probability maps of marker locations\nfrom filtered gradient maps of projections. This guides the development of a\nPython-based algorithm to detect fiducial markers in projection images using\nMeta AI's Segment Anything Model 2 (SAM 2). Retrospective data from a\npancreatic cancer patient with two fiducial markers were analyzed. The\nthree-dimensional (3D) marker positions from simulation computed tomography\n(CT) were compared to those reconstructed from CBCT images, revealing a\ndecrease in relative distances between markers over time. Fiducial markers were\nsuccessfully detected in 2777 out of 2786 projection frames. 
The average\nstandard deviation of superior-inferior (SI) marker positions was 0.56 mm per\nbreath-hold, with differences in average SI positions between two breath-holds\nin the same scan reaching up to 5.2 mm, and a gap of up to 7.3 mm between the\nend of the first and beginning of the second breath-hold. 3D marker positions\nwere calculated using projection positions and confirmed marker migration. This\nmethod effectively calculates marker probability volume and enables accurate\nfiducial marker tracking during treatment without requiring any specialized\nequipment, additional radiation doses, or manual initialization and labeling.\nIt has significant potential for automatically assessing daily residual motion\nto adjust planning margins, functioning as an adaptive radiation therapy tool.\n","authors":["Aimee Guo","Weihua Mao"],"pdf_url":"https://arxiv.org/pdf/2501.15660v1.pdf","comment":"14 pages, 9 figures, Regeneron STS 2025 project. Project page:\n https://sites.google.com/view/markertrack?usp=sharing"},{"id":"http://arxiv.org/abs/2501.15659v1","updated":"2025-01-26T19:43:41Z","published":"2025-01-26T19:43:41Z","title":"AirIO: Learning Inertial Odometry with Enhanced IMU Feature\n Observability","summary":" Inertial odometry (IO) using only Inertial Measurement Units (IMUs) offers a\nlightweight and cost-effective solution for Unmanned Aerial Vehicle (UAV)\napplications, yet existing learning-based IO models often fail to generalize to\nUAVs due to the highly dynamic and non-linear-flight patterns that differ from\npedestrian motion. In this work, we identify that the conventional practice of\ntransforming raw IMU data to global coordinates undermines the observability of\ncritical kinematic information in UAVs. By preserving the body-frame\nrepresentation, our method achieves substantial performance improvements, with\na 66.7% average increase in accuracy across three datasets. Furthermore,\nexplicitly encoding attitude information into the motion network results in an\nadditional 23.8% improvement over prior results. Combined with a data-driven\nIMU correction model (AirIMU) and an uncertainty-aware Extended Kalman Filter\n(EKF), our approach ensures robust state estimation under aggressive UAV\nmaneuvers without relying on external sensors or control inputs. Notably, our\nmethod also demonstrates strong generalizability to unseen data not included in\nthe training set, underscoring its potential for real-world UAV applications.\n","authors":["Yuheng Qiu","Can Xu","Yutian Chen","Shibo Zhao","Junyi Geng","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2501.15659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15656v1","updated":"2025-01-26T19:35:46Z","published":"2025-01-26T19:35:46Z","title":"Classifying Deepfakes Using Swin Transformers","summary":" The proliferation of deepfake technology poses significant challenges to the\nauthenticity and trustworthiness of digital media, necessitating the\ndevelopment of robust detection methods. This study explores the application of\nSwin Transformers, a state-of-the-art architecture leveraging shifted windows\nfor self-attention, in detecting and classifying deepfake images. Using the\nReal and Fake Face Detection dataset by Yonsei University's Computational\nIntelligence Photography Lab, we evaluate the Swin Transformer and hybrid\nmodels such as Swin-ResNet and Swin-KNN, focusing on their ability to identify\nsubtle manipulation artifacts. 
Our results demonstrate that the Swin\nTransformer outperforms conventional CNN-based architectures, including VGG16,\nResNet18, and AlexNet, achieving a test accuracy of 71.29\\%. Additionally, we\npresent insights into hybrid model design, highlighting the complementary\nstrengths of transformer and CNN-based approaches in deepfake detection. This\nstudy underscores the potential of transformer-based architectures for\nimproving accuracy and generalizability in image-based manipulation detection,\npaving the way for more effective countermeasures against deepfake threats.\n","authors":["Aprille J. Xi","Eason Chen"],"pdf_url":"https://arxiv.org/pdf/2501.15656v1.pdf","comment":"3 pages"},{"id":"http://arxiv.org/abs/2501.15653v1","updated":"2025-01-26T19:29:49Z","published":"2025-01-26T19:29:49Z","title":"A Privacy Enhancing Technique to Evade Detection by Street Video Cameras\n Without Using Adversarial Accessories","summary":" In this paper, we propose a privacy-enhancing technique leveraging an\ninherent property of automatic pedestrian detection algorithms, namely, that\nthe training of deep neural network (DNN) based methods is generally performed\nusing curated datasets and laboratory settings, while the operational areas of\nthese methods are dynamic real-world environments. In particular, we leverage a\nnovel side effect of this gap between the laboratory and the real world:\nlocation-based weakness in pedestrian detection. We demonstrate that the\nposition (distance, angle, height) of a person, and ambient light level,\ndirectly impact the confidence of a pedestrian detector when detecting the\nperson. We then demonstrate that this phenomenon is present in pedestrian\ndetectors observing a stationary scene of pedestrian traffic, with blind spot\nareas of weak detection of pedestrians with low confidence. We show how\nprivacy-concerned pedestrians can leverage these blind spots to evade detection\nby constructing a minimum confidence path between two points in a scene,\nreducing the maximum confidence and average confidence of the path by up to\n0.09 and 0.13, respectively, over direct and random paths through the scene. To\ncounter this phenomenon, and force the use of more costly and sophisticated\nmethods to leverage this vulnerability, we propose a novel countermeasure to\nimprove the confidence of pedestrian detectors in blind spots, raising the\nmax/average confidence of paths generated by our technique by 0.09 and 0.05,\nrespectively. In addition, we demonstrate that our countermeasure improves a\nFaster R-CNN-based pedestrian detector's TPR and average true positive\nconfidence by 0.03 and 0.15, respectively.\n","authors":["Jacob Shams","Ben Nassi","Satoru Koda","Asaf Shabtai","Yuval Elovici"],"pdf_url":"https://arxiv.org/pdf/2501.15653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15648v1","updated":"2025-01-26T19:17:05Z","published":"2025-01-26T19:17:05Z","title":"Can Pose Transfer Models Generate Realistic Human Motion?","summary":" Recent pose-transfer methods aim to generate temporally consistent and fully\ncontrollable videos of human action where the motion from a reference video is\nreenacted by a new identity. We evaluate three state-of-the-art pose-transfer\nmethods -- AnimateAnyone, MagicAnimate, and ExAvatar -- by generating videos\nwith actions and identities outside the training distribution and conducting a\nparticipant study about the quality of these videos. 
In a controlled\nenvironment of 20 distinct human actions, we find that participants, presented\nwith the pose-transferred videos, correctly identify the desired action only\n42.92% of the time. Moreover, the participants find the actions in the\ngenerated videos consistent with the reference (source) videos only 36.46% of\nthe time. These results vary by method: participants find the splatting-based\nExAvatar more consistent and photorealistic than the diffusion-based\nAnimateAnyone and MagicAnimate.\n","authors":["Vaclav Knapp","Matyas Bohacek"],"pdf_url":"https://arxiv.org/pdf/2501.15648v1.pdf","comment":"Data and code available at\n https://github.com/matyasbohacek/pose-transfer-human-motion"},{"id":"http://arxiv.org/abs/2409.14677v2","updated":"2025-01-26T19:07:42Z","published":"2024-09-23T02:59:07Z","title":"Reflecting Reality: Enabling Diffusion Models to Produce Faithful Mirror\n Reflections","summary":" We tackle the problem of generating highly realistic and plausible mirror\nreflections using diffusion-based generative models. We formulate this problem\nas an image inpainting task, allowing for more user control over the placement\nof mirrors during the generation process. To enable this, we create SynMirror,\na large-scale dataset of diverse synthetic scenes with objects placed in front\nof mirrors. SynMirror contains around 198k samples rendered from 66k unique 3D\nobjects, along with their associated depth maps, normal maps and instance-wise\nsegmentation masks, to capture relevant geometric properties of the scene.\nUsing this dataset, we propose a novel depth-conditioned inpainting method\ncalled MirrorFusion, which generates high-quality, realistic, shape and\nappearance-aware reflections of real-world objects. MirrorFusion outperforms\nstate-of-the-art methods on SynMirror, as demonstrated by extensive\nquantitative and qualitative analysis. To the best of our knowledge, we are the\nfirst to successfully tackle the challenging problem of generating controlled\nand faithful mirror reflections of an object in a scene using diffusion-based\nmodels. SynMirror and MirrorFusion open up new avenues for image editing and\naugmented reality applications for practitioners and researchers alike. The\nproject page is available at:\nhttps://val.cds.iisc.ac.in/reflecting-reality.github.io/.\n","authors":["Ankit Dhiman","Manan Shah","Rishubh Parihar","Yash Bhalgat","Lokesh R Boregowda","R Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2409.14677v2.pdf","comment":"Accepted to 3DV 2025. First two authors contributed equally. Project\n Page: https://val.cds.iisc.ac.in/reflecting-reality.github.io/"},{"id":"http://arxiv.org/abs/2501.15641v1","updated":"2025-01-26T19:01:19Z","published":"2025-01-26T19:01:19Z","title":"Bringing Characters to New Stories: Training-Free Theme-Specific Image\n Generation via Dynamic Visual Prompting","summary":" The stories and characters that captivate us as we grow up shape unique\nfantasy worlds, with images serving as the primary medium for visually\nexperiencing these realms. Personalizing generative models through fine-tuning\nwith theme-specific data has become a prevalent approach in text-to-image\ngeneration. However, unlike object customization, which focuses on learning\nspecific objects, theme-specific generation encompasses diverse elements such\nas characters, scenes, and objects. Such diversity also introduces a key\nchallenge: how to adaptively generate multi-character, multi-concept, and\ncontinuous theme-specific images (TSI). 
Moreover, fine-tuning approaches often\ncome with significant computational overhead, time costs, and risks of\noverfitting. This paper explores a fundamental question: Can image generation\nmodels directly leverage images as contextual input, similarly to how large\nlanguage models use text as context? To address this, we present T-Prompter, a\nnovel training-free TSI method for generation. T-Prompter introduces visual\nprompting, a mechanism that integrates reference images into generative models,\nallowing users to seamlessly specify the target theme without requiring\nadditional training. To further enhance this process, we propose a Dynamic\nVisual Prompting (DVP) mechanism, which iteratively optimizes visual prompts to\nimprove the accuracy and quality of generated images. Our approach enables\ndiverse applications, including consistent story generation, character design,\nrealistic character generation, and style-guided image generation. Comparative\nevaluations against state-of-the-art personalization methods demonstrate that\nT-Prompter achieves significantly better results and excels in maintaining\ncharacter identity preserving, style consistency and text alignment, offering a\nrobust and flexible solution for theme-specific image generation.\n","authors":["Yuxin Zhang","Minyan Luo","Weiming Dong","Xiao Yang","Haibin Huang","Chongyang Ma","Oliver Deussen","Tong-Yee Lee","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2501.15641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15619v1","updated":"2025-01-26T17:56:11Z","published":"2025-01-26T17:56:11Z","title":"GaussianToken: An Effective Image Tokenizer with 2D Gaussian Splatting","summary":" Effective image tokenization is crucial for both multi-modal understanding\nand generation tasks due to the necessity of the alignment with discrete text\ndata. To this end, existing approaches utilize vector quantization (VQ) to\nproject pixels onto a discrete codebook and reconstruct images from the\ndiscrete representation. However, compared with the continuous latent space,\nthe limited discrete codebook space significantly restrict the representational\nability of these image tokenizers. In this paper, we propose GaussianToken: An\nEffective Image Tokenizer with 2D Gaussian Splatting as a solution. We first\nrepresent the encoded samples as multiple flexible featured 2D Gaussians\ncharacterized by positions, rotation angles, scaling factors, and feature\ncoefficients. We adopt the standard quantization for the Gaussian features and\nthen concatenate the quantization results with the other intrinsic Gaussian\nparameters before the corresponding splatting operation and the subsequent\ndecoding module. In general, GaussianToken integrates the local influence of 2D\nGaussian distribution into the discrete space and thus enhances the\nrepresentation capability of the image tokenizer. Competitive reconstruction\nperformances on CIFAR, Mini-ImageNet, and ImageNet-1K demonstrate the\neffectiveness of our framework. 
Our code is available at:\nhttps://github.com/ChrisDong-THU/GaussianToken.\n","authors":["Jiajun Dong","Chengkun Wang","Wenzhao Zheng","Lei Chen","Jiwen Lu","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2501.15619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15616v1","updated":"2025-01-26T17:51:03Z","published":"2025-01-26T17:51:03Z","title":"IPVTON: Image-based 3D Virtual Try-on with Image Prompt Adapter","summary":" Given a pair of images depicting a person and a garment separately,\nimage-based 3D virtual try-on methods aim to reconstruct a 3D human model that\nrealistically portrays the person wearing the desired garment. In this paper,\nwe present IPVTON, a novel image-based 3D virtual try-on framework. IPVTON\nemploys score distillation sampling with image prompts to optimize a hybrid 3D\nhuman representation, integrating target garment features into diffusion priors\nthrough an image prompt adapter. To avoid interference with non-target areas,\nwe leverage mask-guided image prompt embeddings to focus the image features on\nthe try-on regions. Moreover, we impose geometric constraints on the 3D model\nwith a pseudo silhouette generated by ControlNet, ensuring that the clothed 3D\nhuman model retains the shape of the source identity while accurately wearing\nthe target garments. Extensive qualitative and quantitative experiments\ndemonstrate that IPVTON outperforms previous methods in image-based 3D virtual\ntry-on tasks, excelling in both geometry and texture.\n","authors":["Xiaojing Zhong","Zhonghua Wu","Xiaofeng Yang","Guosheng Lin","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2501.15616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10744v2","updated":"2025-01-26T17:35:40Z","published":"2024-10-14T17:22:12Z","title":"Adversarially Robust Out-of-Distribution Detection Using\n Lyapunov-Stabilized Embeddings","summary":" Despite significant advancements in out-of-distribution (OOD) detection,\nexisting methods still struggle to maintain robustness against adversarial\nattacks, compromising their reliability in critical real-world applications.\nPrevious studies have attempted to address this challenge by exposing detectors\nto auxiliary OOD datasets alongside adversarial training. However, the\nincreased data complexity inherent in adversarial training, and the myriad of\nways that OOD samples can arise during testing, often prevent these approaches\nfrom establishing robust decision boundaries. To address these limitations, we\npropose AROS, a novel approach leveraging neural ordinary differential\nequations (NODEs) with Lyapunov stability theorem in order to obtain robust\nembeddings for OOD detection. By incorporating a tailored loss function, we\napply Lyapunov stability theory to ensure that both in-distribution (ID) and\nOOD data converge to stable equilibrium points within the dynamical system.\nThis approach encourages any perturbed input to return to its stable\nequilibrium, thereby enhancing the model's robustness against adversarial\nperturbations. To not use additional data, we generate fake OOD embeddings by\nsampling from low-likelihood regions of the ID data feature space,\napproximating the boundaries where OOD data are likely to reside. To then\nfurther enhance robustness, we propose the use of an orthogonal binary layer\nfollowing the stable feature space, which maximizes the separation between the\nequilibrium points of ID and OOD samples. 
We validate our method through\nextensive experiments across several benchmarks, demonstrating superior\nperformance, particularly under adversarial attacks. Notably, our approach\nimproves robust detection performance from 37.8% to 80.1% on CIFAR-10 vs.\nCIFAR-100 and from 29.0% to 67.0% on CIFAR-100 vs. CIFAR-10.\n","authors":["Hossein Mirzaei","Mackenzie W. Mathis"],"pdf_url":"https://arxiv.org/pdf/2410.10744v2.pdf","comment":"Accepted at the International Conference on Learning Representations\n (ICLR) 2025. Code and pre-trained models are available at\n https://github.com/AdaptiveMotorControlLab/AROS"},{"id":"http://arxiv.org/abs/2501.15610v1","updated":"2025-01-26T17:32:58Z","published":"2025-01-26T17:32:58Z","title":"Radiologist-in-the-Loop Self-Training for Generalizable CT Metal\n Artifact Reduction","summary":" Metal artifacts in computed tomography (CT) images can significantly degrade\nimage quality and impede accurate diagnosis. Supervised metal artifact\nreduction (MAR) methods, trained using simulated datasets, often struggle to\nperform well on real clinical CT images due to a substantial domain gap.\nAlthough state-of-the-art semi-supervised methods use pseudo ground-truths\ngenerated by a prior network to mitigate this issue, their reliance on a fixed\nprior limits both the quality and quantity of these pseudo ground-truths,\nintroducing confirmation bias and reducing clinical applicability. To address\nthese limitations, we propose a novel Radiologist-In-the-loop SElf-training\nframework for MAR, termed RISE-MAR, which can integrate radiologists' feedback\ninto the semi-supervised learning process, progressively improving the quality\nand quantity of pseudo ground-truths for enhanced generalization on real\nclinical CT images. For quality assurance, we introduce a clinical quality\nassessor model that emulates radiologist evaluations, effectively selecting\nhigh-quality pseudo ground-truths for semi-supervised training. For quantity\nassurance, our self-training framework iteratively generates additional\nhigh-quality pseudo ground-truths, expanding the clinical dataset and further\nimproving model generalization. Extensive experimental results on multiple\nclinical datasets demonstrate the superior generalization performance of our\nRISE-MAR over state-of-the-art methods, advancing the development of MAR models\nfor practical application. Code is available at\nhttps://github.com/Masaaki-75/rise-mar.\n","authors":["Chenglong Ma","Zilong Li","Yuanlin Li","Jing Han","Junping Zhang","Yi Zhang","Jiannan Liu","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2501.15610v1.pdf","comment":"IEEE TMI 2025"},{"id":"http://arxiv.org/abs/2501.15603v1","updated":"2025-01-26T17:05:37Z","published":"2025-01-26T17:05:37Z","title":"Advancing TDFN: Precise Fixation Point Generation Using Reconstruction\n Differences","summary":" Wang and Wang (2025) proposed the Task-Driven Fixation Network (TDFN) based\non the fixation mechanism, which leverages low-resolution information along\nwith high-resolution details near fixation points to accomplish specific visual\ntasks. The model employs reinforcement learning to generate fixation points.\nHowever, training reinforcement learning models is challenging, particularly\nwhen aiming to generate pixel-level accurate fixation points on high-resolution\nimages. This paper introduces an improved fixation point generation method by\nleveraging the difference between the reconstructed image and the input image\nto train the fixation point generator. 
This approach directs fixation points to\nareas with significant differences between the reconstructed and input images.\nExperimental results demonstrate that this method achieves highly accurate\nfixation points, significantly enhances the network's classification accuracy,\nand reduces the average number of required fixations to achieve a predefined\naccuracy level.\n","authors":["Shuguang Wang","Yuanjing Wang"],"pdf_url":"https://arxiv.org/pdf/2501.15603v1.pdf","comment":"9 pages, 5 figures, 2 tables"}]},"2025-01-25T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2410.14164v2","updated":"2025-01-25T22:27:30Z","published":"2024-10-18T04:04:58Z","title":"Optimal DLT-based Solutions for the Perspective-n-Point","summary":" We propose a modified normalized direct linear transform (DLT) algorithm for\nsolving the perspective-n-point (PnP) problem with much better behavior than\nthe conventional DLT. The modification consists of analytically weighting the\ndifferent measurements in the linear system with a negligible increase in\ncomputational load. Our approach exhibits clear improvements -- in both\nperformance and runtime -- when compared to popular methods such as EPnP, CPnP,\nRPnP, and OPnP. Our new non-iterative solution approaches that of the true\noptimal found via Gauss-Newton optimization, but at a fraction of the\ncomputational cost. Our optimal DLT (oDLT) implementation, as well as the\nexperiments, are released in open source.\n","authors":["Sébastien Henry","John A. Christian"],"pdf_url":"https://arxiv.org/pdf/2410.14164v2.pdf","comment":"8 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.15272v1","updated":"2025-01-25T16:49:54Z","published":"2025-01-25T16:49:54Z","title":"Safe and Agile Transportation of Cable-Suspended Payload via Multiple\n Aerial Robots","summary":" Transporting a heavy payload using multiple aerial robots (MARs) is an\nefficient manner to extend the load capacity of a single aerial robot. However,\nexisting schemes for the multiple aerial robots transportation system (MARTS)\nstill lack the capability to generate a collision-free and dynamically feasible\ntrajectory in real-time and further track an agile trajectory especially when\nthere are no sensors available to measure the states of payload and cable.\nTherefore, they are limited to low-agility transportation in simple\nenvironments. To bridge the gap, we propose complete planning and control\nschemes for the MARTS, achieving safe and agile aerial transportation (SAAT) of\na cable-suspended payload in complex environments. Flatness maps for the aerial\nrobot considering the complete kinematical constraint and the dynamical\ncoupling between each aerial robot and payload are derived. To improve the\nresponsiveness for the generation of the safe, dynamically feasible, and agile\ntrajectory in complex environments, a real-time spatio-temporal trajectory\nplanning scheme is proposed for the MARTS. Besides, we break away from the\nreliance on the state measurement for both the payload and cable, as well as\nthe closed-loop control for the payload, and propose a fully distributed\ncontrol scheme to track the agile trajectory that is robust against imprecise\npayload mass and non-point mass payload. The proposed schemes are extensively\nvalidated through benchmark comparisons, ablation studies, and simulations.\nFinally, extensive real-world experiments are conducted on a MARTS integrated\nby three aerial robots with onboard computers and sensors. 
The result validates\nthe efficiency and robustness of our proposed schemes for SAAT in complex\nenvironments.\n","authors":["Yongchao Wang","Junjie Wang","Xiaobin Zhou","Tiankai Yang","Chao Xu","Fei Gao"],"pdf_url":"https://arxiv.org/pdf/2501.15272v1.pdf","comment":"20 pages, 14 figures, submitted to IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2501.15214v1","updated":"2025-01-25T13:33:22Z","published":"2025-01-25T13:33:22Z","title":"Zero-shot Robotic Manipulation with Language-guided Instruction and\n Formal Task Planning","summary":" Robotic manipulation is often challenging due to the long-horizon tasks and\nthe complex object relationships. A common solution is to develop a task and\nmotion planning framework that integrates planning for high-level task and\nlow-level motion. Recently, inspired by the powerful reasoning ability of Large\nLanguage Models (LLMs), LLM-based planning approaches have achieved remarkable\nprogress. However, these methods still heavily rely on expert-specific\nknowledge, often generating invalid plans for unseen and unfamiliar tasks. To\naddress this issue, we propose an innovative language-guided symbolic task\nplanning (LM-SymOpt) framework with optimization. It is the first expert-free\nplanning framework since we combine the world knowledge from LLMs with formal\nreasoning, resulting in improved generalization capability to new tasks.\nSpecifically, differ to most existing work, our LM-SymOpt employs LLMs to\ntranslate natural language instructions into symbolic representations, thereby\nrepresenting actions as high-level symbols and reducing the search space for\nplanning. Next, after evaluating the action probability of completing the task\nusing LLMs, a weighted random sampling method is introduced to generate\ncandidate plans. Their feasibility is assessed through symbolic reasoning and\ntheir cost efficiency is then evaluated using trajectory optimization for\nselecting the optimal planning. Our experimental results show that LM-SymOpt\noutperforms existing LLM-based planning approaches.\n","authors":["Junfeng Tang","Zihan Ye","Yuping Yan","Ziqi Zheng","Ting Gao","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2501.15214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15198v1","updated":"2025-01-25T12:32:52Z","published":"2025-01-25T12:32:52Z","title":"Towards Conscious Service Robots","summary":" Deep learning's success in perception, natural language processing, etc.\ninspires hopes for advancements in autonomous robotics. However, real-world\nrobotics face challenges like variability, high-dimensional state spaces,\nnon-linear dependencies, and partial observability. A key issue is\nnon-stationarity of robots, environments, and tasks, leading to performance\ndrops with out-of-distribution data. Unlike current machine learning models,\nhumans adapt quickly to changes and new tasks due to a cognitive architecture\nthat enables systematic generalization and meta-cognition. Human brain's System\n1 handles routine tasks unconsciously, while System 2 manages complex tasks\nconsciously, facilitating flexible problem-solving and self-monitoring. For\nrobots to achieve human-like learning and reasoning, they need to integrate\ncausal models, working memory, planning, and metacognitive processing. 
By\nincorporating human cognition insights, the next generation of service robots\nwill handle novel situations and monitor themselves to avoid risks and mitigate\nerrors.\n","authors":["Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2501.15198v1.pdf","comment":"In: Science for a Better Tomorrow: Curious 2024 Insights Actions,\n Springer 2025"},{"id":"http://arxiv.org/abs/2501.15189v1","updated":"2025-01-25T12:01:56Z","published":"2025-01-25T12:01:56Z","title":"Extracting Forward Invariant Sets from Neural Network-Based Control\n Barrier Functions","summary":" Training Neural Networks (NNs) to serve as Barrier Functions (BFs) is a\npopular way to improve the safety of autonomous dynamical systems. Despite\nsignificant practical success, these methods are not generally guaranteed to\nproduce true BFs in a provable sense, which undermines their intended use as\nsafety certificates. In this paper, we consider the problem of formally\ncertifying a learned NN as a BF with respect to state avoidance for an\nautonomous system: viz. computing a region of the state space on which the\ncandidate NN is provably a BF. In particular, we propose a sound algorithm that\nefficiently produces such a certificate set for a shallow NN. Our algorithm\ncombines two novel approaches: it first uses NN reachability tools to identify\na subset of states for which the output of the NN does not increase along\nsystem trajectories; then, it uses a novel enumeration algorithm for hyperplane\narrangements to find the intersection of the NN's zero-sub-level set with the\nfirst set of states. In this way, our algorithm soundly finds a subset of\nstates on which the NN is certified as a BF. We further demonstrate the\neffectiveness of our algorithm at certifying for real-world NNs as BFs in two\ncase studies. We complemented these with scalability experiments that\ndemonstrate the efficiency of our algorithm.\n","authors":["Goli Vaisi","James Ferlez","Yasser Shoukry"],"pdf_url":"https://arxiv.org/pdf/2501.15189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09905v3","updated":"2025-01-25T10:43:04Z","published":"2025-01-17T01:32:18Z","title":"SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon\n Visuomotor Learning","summary":" We present a low-cost legged mobile manipulation system that solves\nlong-horizon real-world tasks, trained by reinforcement learning purely in\nsimulation. This system is made possible by 1) a hierarchical design of a\nhigh-level policy for visual-mobile manipulation following instructions and a\nlow-level policy for quadruped movement and limb control, 2) a progressive\nexploration and learning approach that leverages privileged task decomposition\ninformation to train the teacher policy for long-horizon tasks, which will\nguide an imitation-based student policy for efficient training of the\nhigh-level visuomotor policy, and 3) a suite of techniques for minimizing\nsim-to-real gaps.\n In contrast to previous approaches that use high-end equipment, our system\ndemonstrates effective performance with more accessible hardware -\nspecifically, a Unitree Go1 quadruped, a WidowX250S arm, and a single\nwrist-mounted RGB camera - despite the increased challenges of sim-to-real\ntransfer. When fully trained in simulation, a single policy autonomously solves\nlong-horizon tasks such as search, move, grasp, and drop-into, achieving nearly\n80% success. 
This performance is comparable to that of expert human\nteleoperation on the same tasks but significantly more efficient, operating at\nabout 1.5x the speed. The sim-to-real transfer is fluid across diverse indoor\nand outdoor scenes under varying lighting conditions. Finally, we discuss the\nkey techniques that enable the entire pipeline, including efficient RL training\nand sim-to-real, to work effectively for legged mobile manipulation, and\npresent their ablation results.\n","authors":["Haichao Zhang","Haonan Yu","Le Zhao","Andrew Choi","Qinxun Bai","Break Yang","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2501.09905v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15078v1","updated":"2025-01-25T05:13:41Z","published":"2025-01-25T05:13:41Z","title":"Impact-resistant, autonomous robots inspired by tensegrity architecture","summary":" Future robots will navigate perilous, remote environments with resilience and\nautonomy. Researchers have proposed building robots with compliant bodies to\nenhance robustness, but this approach often sacrifices the autonomous\ncapabilities expected of rigid robots. Inspired by tensegrity architecture, we\nintroduce a tensegrity robot -- a hybrid robot made from rigid struts and\nelastic tendons -- that demonstrates the advantages of compliance and the\nautonomy necessary for task performance. This robot boasts impact resistance\nand autonomy in a field environment and additional advances in the state of the\nart, including surviving harsh impacts from drops (at least 5.7 m), accurately\nreconstructing its shape and orientation using on-board sensors, achieving high\nlocomotion speeds (18 bar lengths per minute), and climbing the steepest\nincline of any tensegrity robot (28 degrees). We characterize the robot's\nlocomotion on unstructured terrain, showcase its autonomous capabilities in\nnavigation tasks, and demonstrate its robustness by rolling it off a cliff.\n","authors":["William R. Johnson III","Xiaonan Huang","Shiyang Lu","Kun Wang","Joran W. Booth","Kostas Bekris","Rebecca Kramer-Bottiglio"],"pdf_url":"https://arxiv.org/pdf/2501.15078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15071v1","updated":"2025-01-25T04:33:43Z","published":"2025-01-25T04:33:43Z","title":"Understanding via Gaze: Gaze-based Task Decomposition for Imitation\n Learning of Robot Manipulation","summary":" In imitation learning for robotic manipulation, decomposing object\nmanipulation tasks into multiple semantic actions is essential. This\ndecomposition enables the reuse of learned skills in varying contexts and the\ncombination of acquired skills to perform novel tasks, rather than merely\nreplicating demonstrated motions. Gaze, an evolutionary tool for understanding\nongoing events, plays a critical role in human object manipulation, where it\nstrongly correlates with motion planning. In this study, we propose a simple\nyet robust task decomposition method based on gaze transitions. We hypothesize\nthat an imitation agent's gaze control, fixating on specific landmarks and\ntransitioning between them, naturally segments demonstrated manipulations into\nsub-tasks. Notably, our method achieves consistent task decomposition across\nall demonstrations, which is desirable in contexts such as machine learning.\nUsing teleoperation, a common modality in imitation learning for robotic\nmanipulation, we collected demonstration data for various tasks, applied our\nsegmentation method, and evaluated the characteristics and consistency of the\nresulting sub-tasks. 
Furthermore, through extensive testing across a wide range\nof hyperparameter variations, we demonstrated that the proposed method\npossesses the robustness necessary for application to different robotic\nsystems.\n","authors":["Ryo Takizawa","Yoshiyuki Ohmura","Yasuo Kuniyoshi"],"pdf_url":"https://arxiv.org/pdf/2501.15071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15068v1","updated":"2025-01-25T04:19:33Z","published":"2025-01-25T04:19:33Z","title":"An Atomic Skill Library Construction Method for Data-Efficient Embodied\n Manipulation","summary":" Embodied manipulation is a fundamental ability in the realm of embodied\nartificial intelligence. Although current embodied manipulation models show\ncertain generalizations in specific settings, they struggle in new environments\nand tasks due to the complexity and diversity of real-world scenarios. The\ntraditional end-to-end data collection and training manner leads to significant\ndata demands, which we call ``data explosion''. To address the issue, we\nintroduce a three-wheeled data-driven method to build an atomic skill library.\nWe divide tasks into subtasks using the Vision-Language Planning (VLP). Then,\natomic skill definitions are formed by abstracting the subtasks. Finally, an\natomic skill library is constructed via data collection and\nVision-Language-Action (VLA) fine-tuning. As the atomic skill library expands\ndynamically with the three-wheel update strategy, the range of tasks it can\ncover grows naturally. In this way, our method shifts focus from end-to-end\ntasks to atomic skills, significantly reducing data costs while maintaining\nhigh performance and enabling efficient adaptation to new tasks. Extensive\nexperiments in real-world settings demonstrate the effectiveness and efficiency\nof our approach.\n","authors":["Dongjiang Li","Bo Peng","Chang Li","Ning Qiao","Qi Zheng","Lei Sun","Yusen Qin","Bangguo Li","Yifeng Luan","Yibing Zhan","Mingang Sun","Tong Xu","Lusong Li","Hui Shen","Xiaodong He"],"pdf_url":"https://arxiv.org/pdf/2501.15068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16087v5","updated":"2025-01-25T04:11:34Z","published":"2024-06-23T12:02:17Z","title":"Imperative Learning: A Self-supervised Neuro-Symbolic Learning Framework\n for Robot Autonomy","summary":" Data-driven methods such as reinforcement and imitation learning have\nachieved remarkable success in robot autonomy. However, their data-centric\nnature still hinders them from generalizing well to ever-changing environments.\nMoreover, collecting large datasets for robotic tasks is often impractical and\nexpensive. To overcome these challenges, we introduce a new self-supervised\nneuro-symbolic (NeSy) computational framework, imperative learning (IL), for\nrobot autonomy, leveraging the generalization abilities of symbolic reasoning.\nThe framework of IL consists of three primary components: a neural module, a\nreasoning engine, and a memory system. We formulate IL as a special bilevel\noptimization (BLO), which enables reciprocal learning over the three modules.\nThis overcomes the label-intensive obstacles associated with data-driven\napproaches and takes advantage of symbolic reasoning concerning logical\nreasoning, physical principles, geometric analysis, etc. We discuss several\noptimization techniques for IL and verify their effectiveness in five distinct\nrobot autonomy tasks including path planning, rule induction, optimal control,\nvisual odometry, and multi-robot routing. 
Through various experiments, we show\nthat IL can significantly enhance robot autonomy capabilities and we anticipate\nthat it will catalyze further research across diverse domains.\n","authors":["Chen Wang","Kaiyi Ji","Junyi Geng","Zhongqiang Ren","Taimeng Fu","Fan Yang","Yifan Guo","Haonan He","Xiangyu Chen","Zitong Zhan","Qiwei Du","Shaoshu Su","Bowen Li","Yuheng Qiu","Yi Du","Qihang Li","Yifan Yang","Xiao Lin","Zhipeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.16087v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05585v2","updated":"2025-01-25T03:16:03Z","published":"2024-10-08T00:58:42Z","title":"Towards Robust Spacecraft Trajectory Optimization via Transformers","summary":" Future multi-spacecraft missions require robust autonomous trajectory\noptimization capabilities to ensure safe and efficient rendezvous operations.\nThis capability hinges on solving non-convex optimal control problems in\nreal-time, although traditional iterative methods such as sequential convex\nprogramming impose significant computational challenges. To mitigate this\nburden, the Autonomous Rendezvous Transformer (ART) introduced a generative\nmodel trained to provide near-optimal initial guesses. This approach provides\nconvergence to better local optima (e.g., fuel optimality), improves\nfeasibility rates, and results in faster convergence speed of optimization\nalgorithms through warm-starting. This work extends the capabilities of ART to\naddress robust chance-constrained optimal control problems. Specifically, ART\nis applied to challenging rendezvous scenarios in Low Earth Orbit (LEO),\nensuring fault-tolerant behavior under uncertainty. Through extensive\nexperimentation, the proposed warm-starting strategy is shown to consistently\nproduce high-quality reference trajectories, achieving up to 30\\% cost\nimprovement and 50\\% reduction in infeasible cases compared to conventional\nmethods, demonstrating robust performance across multiple state\nrepresentations. Additionally, a post hoc evaluation framework is proposed to\nassess the quality of generated trajectories and mitigate runtime failures,\nmarking an initial step toward the reliable deployment of AI-driven solutions\nin safety-critical autonomous systems such as spacecraft.\n","authors":["Yuji Takubo","Tommaso Guffanti","Daniele Gammelli","Marco Pavone","Simone D'Amico"],"pdf_url":"https://arxiv.org/pdf/2410.05585v2.pdf","comment":"Submitted to the IEEE Aerospace Conference 2025. 13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2501.01831v2","updated":"2025-01-25T01:50:30Z","published":"2025-01-03T14:32:17Z","title":"Online Fault Tolerance Strategy for Abrupt Reachability Constraint\n Changes","summary":" When a system's constraints change abruptly, the system's reachability safety\ndoes no longer sustain. Thus, the system can reach a forbidden/dangerous value.\nConventional remedy practically involves online controller redesign (OCR) to\nre-establish the reachability's compliance with the new constraints, which,\nhowever, is usually too slow. There is a need for an online strategy capable of\nmanaging runtime changes in reachability constraints. However, to the best of\nthe authors' knowledge, this topic has not been addressed in the existing\nliterature. In this paper, we propose a fast fault tolerance strategy to\nrecover the system's reachability safety in runtime. 
Instead of redesigning the\nsystem's controller, we propose to change the system's reference state to\nmodify the system's reachability to comply with the new constraints. We frame\nthe reference state search as an optimization problem and employ the\nKarush-Kuhn-Tucker (KKT) method as well as the Interior Point Method (IPM)\nbased Newton's method (as a fallback for the KKT method) for fast solution\nderivation. The optimization also allows more future fault tolerance. Numerical\nsimulations demonstrate that our method outperforms the conventional OCR method\nin terms of computational efficiency and success rate. Specifically, the\nresults show that the proposed method finds a solution $10^{2}$ (with the IPM\nbased Newton's method) $\\sim 10^{4}$ (with the KKT method) times faster than\nthe OCR method. Additionally, the improvement rate of the success rate of our\nmethod over the OCR method is $40.81\\%$ without considering the deadline of run\ntime. The success rate remains at $49.44\\%$ for the proposed method, while it\nbecomes $0\\%$ for the OCR method when a deadline of $1.5 \\; seconds$ is\nimposed.\n","authors":["Henghua Shen","Qixin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.01831v2.pdf","comment":"9 pages, 2 figures,"},{"id":"http://arxiv.org/abs/2405.14005v2","updated":"2025-01-25T00:38:28Z","published":"2024-05-22T21:22:44Z","title":"Neural Scaling Laws in Robotics","summary":" Neural scaling laws have driven significant advancements in machine learning,\nparticularly in domains like language modeling and computer vision. However,\nthe exploration of neural scaling laws within robotics has remained relatively\nunderexplored, despite the growing adoption of foundation models in this field.\nThis paper represents the first comprehensive study to quantify neural scaling\nlaws for Robot Foundation Models (RFMs) and Large Language Models (LLMs) in\nrobotics tasks. Through a meta-analysis of 327 research papers, we investigate\nhow data size, model size, and compute resources influence downstream\nperformance across a diverse set of robotic tasks. Consistent with previous\nscaling law research, our results reveal that the performance of robotic models\nimproves with increased resources, following a power-law relationship.\nPromisingly, the improvement in robotic task performance scales notably faster\nthan language tasks. This suggests that, while performance on downstream\nrobotic tasks today is often moderate-to-poor, increased data and compute are\nlikely to signficantly improve performance in the future. Also consistent with\nprevious scaling law research, we also observe the emergence of new robot\ncapabilities as models scale.\n","authors":["Sebastian Sartor","Neil Thompson"],"pdf_url":"https://arxiv.org/pdf/2405.14005v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.14992v1","updated":"2025-01-25T00:00:11Z","published":"2025-01-25T00:00:11Z","title":"Extensive Exploration in Complex Traffic Scenarios using Hierarchical\n Reinforcement Learning","summary":" Developing an automated driving system capable of navigating complex traffic\nenvironments remains a formidable challenge. Unlike rule-based or supervised\nlearning-based methods, Deep Reinforcement Learning (DRL) based controllers\neliminate the need for domain-specific knowledge and datasets, thus providing\nadaptability to various scenarios. 
Nonetheless, a common limitation of existing\nstudies on DRL-based controllers is their focus on driving scenarios with\nsimple traffic patterns, which hinders their capability to effectively handle\ncomplex driving environments with delayed, long-term rewards, thus compromising\nthe generalizability of their findings. In response to these limitations, our\nresearch introduces a pioneering hierarchical framework that efficiently\ndecomposes intricate decision-making problems into manageable and interpretable\nsubtasks. We adopt a two step training process that trains the high-level\ncontroller and low-level controller separately. The high-level controller\nexhibits an enhanced exploration potential with long-term delayed rewards, and\nthe low-level controller provides longitudinal and lateral control ability\nusing short-term instantaneous rewards. Through simulation experiments, we\ndemonstrate the superiority of our hierarchical controller in managing complex\nhighway driving situations.\n","authors":["Zhihao Zhang","Ekim Yurtsever","Keith A. Redmill"],"pdf_url":"https://arxiv.org/pdf/2501.14992v1.pdf","comment":null}]},"2025-01-28T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2402.18393v3","updated":"2025-01-28T17:36:51Z","published":"2024-02-28T15:13:33Z","title":"Decictor: Towards Evaluating the Robustness of Decision-Making in\n Autonomous Driving Systems","summary":" Autonomous Driving System (ADS) testing is crucial in ADS development, with\nthe current primary focus being on safety. However, the evaluation of\nnon-safety-critical performance, particularly the ADS's ability to make optimal\ndecisions and produce optimal paths for autonomous vehicles (AVs), is also\nvital to ensure the intelligence and reduce risks of AVs. Currently, there is\nlittle work dedicated to assessing the robustness of ADSs' path-planning\ndecisions (PPDs), i.e., whether an ADS can maintain the optimal PPD after an\ninsignificant change in the environment. The key challenges include the lack of\nclear oracles for assessing PPD optimality and the difficulty in searching for\nscenarios that lead to non-optimal PPDs. To fill this gap, in this paper, we\nfocus on evaluating the robustness of ADSs' PPDs and propose the first method,\nDecictor, for generating non-optimal decision scenarios (NoDSs), where the ADS\ndoes not plan optimal paths for AVs. Decictor comprises three main components:\nNon-invasive Mutation, Consistency Check, and Feedback. To overcome the oracle\nchallenge, Non-invasive Mutation is devised to implement conservative\nmodifications, ensuring the preservation of the original optimal path in the\nmutated scenarios. Subsequently, the Consistency Check is applied to determine\nthe presence of non-optimal PPDs by comparing the driving paths in the original\nand mutated scenarios. To deal with the challenge of large environment space,\nwe design Feedback metrics that integrate spatial and temporal dimensions of\nthe AV's movement. These metrics are crucial for effectively steering the\ngeneration of NoDSs. We evaluate Decictor on Baidu Apollo, an open-source and\nproduction-grade ADS. 
The experimental results validate the effectiveness of\nDecictor in detecting non-optimal PPDs of ADSs.\n","authors":["Mingfei Cheng","Yuan Zhou","Xiaofei Xie","Junjie Wang","Guozhu Meng","Kairui Yang"],"pdf_url":"https://arxiv.org/pdf/2402.18393v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07688v2","updated":"2025-01-28T16:02:30Z","published":"2024-10-10T07:54:17Z","title":"PokeFlex: A Real-World Dataset of Volumetric Deformable Objects for\n Robotics","summary":" Data-driven methods have shown great potential in solving challenging\nmanipulation tasks; however, their application in the domain of deformable\nobjects has been constrained, in part, by the lack of data. To address this\nlack, we propose PokeFlex, a dataset featuring real-world multimodal data that\nis paired and annotated. The modalities include 3D textured meshes, point\nclouds, RGB images, and depth maps. Such data can be leveraged for several\ndownstream tasks, such as online 3D mesh reconstruction, and it can potentially\nenable underexplored applications such as the real-world deployment of\ntraditional control methods based on mesh simulations. To deal with the\nchallenges posed by real-world 3D mesh reconstruction, we leverage a\nprofessional volumetric capture system that allows complete 360{\\deg}\nreconstruction. PokeFlex consists of 18 deformable objects with varying\nstiffness and shapes. Deformations are generated by dropping objects onto a\nflat surface or by poking the objects with a robot arm. Interaction wrenches\nand contact locations are also reported for the latter case. Using different\ndata modalities, we demonstrated a use case for our dataset training models\nthat, given the novelty of the multimodal nature of Pokeflex, constitute the\nstate-of-the-art in multi-object online template-based mesh reconstruction from\nmultimodal data, to the best of our knowledge. We refer the reader to our\nwebsite ( https://pokeflex-dataset.github.io/ ) for further demos and examples.\n","authors":["Jan Obrist","Miguel Zamora","Hehui Zheng","Ronan Hinchet","Firat Ozdemir","Juan Zarate","Robert K. Katzschmann","Stelian Coros"],"pdf_url":"https://arxiv.org/pdf/2410.07688v2.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2501.17022v1","updated":"2025-01-28T15:39:07Z","published":"2025-01-28T15:39:07Z","title":"Mobile Manipulation Instruction Generation from Multiple Images with\n Automatic Metric Enhancement","summary":" We consider the problem of generating free-form mobile manipulation\ninstructions based on a target object image and receptacle image. Conventional\nimage captioning models are not able to generate appropriate instructions\nbecause their architectures are typically optimized for single-image. In this\nstudy, we propose a model that handles both the target object and receptacle to\ngenerate free-form instruction sentences for mobile manipulation tasks.\nMoreover, we introduce a novel training method that effectively incorporates\nthe scores from both learning-based and n-gram based automatic evaluation\nmetrics as rewards. This method enables the model to learn the co-occurrence\nrelationships between words and appropriate paraphrases. 
Results demonstrate\nthat our proposed method outperforms baseline methods including representative\nmultimodal large language models on standard automatic evaluation metrics.\nMoreover, physical experiments reveal that using our method to augment data on\nlanguage instructions improves the performance of an existing multimodal\nlanguage understanding model for mobile manipulation.\n","authors":["Kei Katsumata","Motonari Kambara","Daichi Yashima","Ryosuke Korekata","Komei Sugiura"],"pdf_url":"https://arxiv.org/pdf/2501.17022v1.pdf","comment":"Accepted for IEEE RA-L 2025"},{"id":"http://arxiv.org/abs/2501.17018v1","updated":"2025-01-28T15:32:41Z","published":"2025-01-28T15:32:41Z","title":"Six-Degree-of-Freedom Motion Emulation for Data-Driven Modeling of\n Underwater Vehicles","summary":" This article presents a collaborative research effort aimed at developing a\nnovel six-degree-of-freedom (6-DOF) motion platform for the empirical\ncharacterization of hydrodynamic forces crucial for the control and stability\nof surface and subsurface vehicles. Traditional experimental methods, such as\nthe Planar Motion Mechanism (PMM), are limited by the number of simultaneously\narticulated DOFs and are limited to single-frequency testing, making such\nsystems impractical for resolving frequency-dependent added mass or damping\nmatrices. The 6 DOF platform, termed a hexapod, overcomes these limitations by\noffering enhanced maneuverability and the ability to test broad-banded\nfrequency spectra in multiple degrees of freedom in a single experiment.\n","authors":["Juliana Danesi Ruiz","Michael Swafford","Austin Krebill","Rachel Vitali","Casey Harwood"],"pdf_url":"https://arxiv.org/pdf/2501.17018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.17015v1","updated":"2025-01-28T15:26:25Z","published":"2025-01-28T15:26:25Z","title":"Revisit Mixture Models for Multi-Agent Simulation: Experimental Study\n within a Unified Framework","summary":" Simulation plays a crucial role in assessing autonomous driving systems,\nwhere the generation of realistic multi-agent behaviors is a key aspect. In\nmulti-agent simulation, the primary challenges include behavioral multimodality\nand closed-loop distributional shifts. In this study, we revisit mixture models\nfor generating multimodal agent behaviors, which can cover the mainstream\nmethods including continuous mixture models and GPT-like discrete models.\nFurthermore, we introduce a closed-loop sample generation approach tailored for\nmixture models to mitigate distributional shifts. Within the unified mixture\nmodel~(UniMM) framework, we recognize critical configurations from both model\nand data perspectives. We conduct a systematic examination of various model\nconfigurations, including positive component matching, continuous regression,\nprediction horizon, and the number of components. Moreover, our investigation\ninto the data configuration highlights the pivotal role of closed-loop samples\nin achieving realistic simulations. To extend the benefits of closed-loop\nsamples across a broader range of mixture models, we further address the\nshortcut learning and off-policy learning issues. 
Leveraging insights from our\nexploration, the distinct variants proposed within the UniMM framework,\nincluding discrete, anchor-free, and anchor-based models, all achieve\nstate-of-the-art performance on the WOSAC benchmark.\n","authors":["Longzhong Lin","Xuewu Lin","Kechun Xu","Haojian Lu","Lichao Huang","Rong Xiong","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2501.17015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16997v1","updated":"2025-01-28T14:52:10Z","published":"2025-01-28T14:52:10Z","title":"MAUCell: An Adaptive Multi-Attention Framework for Video Frame\n Prediction","summary":" Temporal sequence modeling stands as the fundamental foundation for video\nprediction systems and real-time forecasting operations as well as anomaly\ndetection applications. The achievement of accurate predictions through\nefficient resource consumption remains an ongoing issue in contemporary\ntemporal sequence modeling. We introduce the Multi-Attention Unit (MAUCell)\nwhich combines Generative Adversarial Networks (GANs) and spatio-temporal\nattention mechanisms to improve video frame prediction capabilities. Our\napproach implements three types of attention models to capture intricate motion\nsequences. A dynamic combination of these attention outputs allows the model to\nreach both advanced decision accuracy along with superior quality while\nremaining computationally efficient. The integration of GAN elements makes\ngenerated frames appear more true to life therefore the framework creates\noutput sequences which mimic real-world footage. The new design system\nmaintains equilibrium between temporal continuity and spatial accuracy to\ndeliver reliable video prediction. Through a comprehensive evaluation\nmethodology which merged the perceptual LPIPS measurement together with classic\ntests MSE, MAE, SSIM and PSNR exhibited enhancing capabilities than\ncontemporary approaches based on direct benchmark tests of Moving MNIST, KTH\nAction, and CASIA-B (Preprocessed) datasets. Our examination indicates that\nMAUCell shows promise for operational time requirements. The research findings\ndemonstrate how GANs work best with attention mechanisms to create better\napplications for predicting video sequences.\n","authors":["Shreyam Gupta","P. Agrawal","Priyam Gupta"],"pdf_url":"https://arxiv.org/pdf/2501.16997v1.pdf","comment":"This work has been submitted to the IJCAI 2025 Conference for review.\n It contains: 11 pages, 4 figures, 7 tables, and 3 Algorithms"},{"id":"http://arxiv.org/abs/2501.16973v1","updated":"2025-01-28T14:14:02Z","published":"2025-01-28T14:14:02Z","title":"Towards Open-Source and Modular Space Systems with ATMOS","summary":" In the near future, autonomous space systems will compose a large number of\nthe spacecraft being deployed. Their tasks will involve autonomous rendezvous\nand proximity operations with large structures, such as inspections or assembly\nof orbiting space stations and maintenance and human-assistance tasks over\nshared workspaces. To promote replicable and reliable scientific results for\nautonomous control of spacecraft, we present the design of a space systems\nlaboratory based on open-source and modular software and hardware. The\nsimulation software provides a software-in-the-loop (SITL) architecture that\nseamlessly transfers simulated results to the ATMOS platforms, developed for\ntesting of multi-agent autonomy schemes for microgravity. 
The manuscript\npresents the KTH space systems laboratory facilities and the ATMOS platform as\nopen-source hardware and software contributions. Preliminary results showcase\nSITL and real testing.\n","authors":["Pedro Roque","Sujet Phodapol","Elias Krantz","Jaeyoung Lim","Joris Verhagen","Frank Jiang","David Dorner","Roland Siegwart","Ivan Stenius","Gunnar Tibert","Huina Mao","Jana Tumova","Christer Fuglesang","Dimos V. Dimarogonas"],"pdf_url":"https://arxiv.org/pdf/2501.16973v1.pdf","comment":"Preliminary release, to be submitted"},{"id":"http://arxiv.org/abs/2501.16947v1","updated":"2025-01-28T13:46:01Z","published":"2025-01-28T13:46:01Z","title":"Image-based Geo-localization for Robotics: Are Black-box Vision-Language\n Models there yet?","summary":" The advances in Vision-Language models (VLMs) offer exciting opportunities\nfor robotic applications involving image geo-localization, the problem of\nidentifying the geo-coordinates of a place based on visual data only. Recent\nresearch works have focused on using a VLM as embeddings extractor for\ngeo-localization, however, the most sophisticated VLMs may only be available as\nblack boxes that are accessible through an API, and come with a number of\nlimitations: there is no access to training data, model features and gradients;\nretraining is not possible; the number of predictions may be limited by the\nAPI; training on model outputs is often prohibited; and queries are open-ended.\nThe utilization of a VLM as a stand-alone, zero-shot geo-localization system\nusing a single text-based prompt is largely unexplored. To bridge this gap,\nthis paper undertakes the first systematic study, to the best of our knowledge,\nto investigate the potential of some of the state-of-the-art VLMs as\nstand-alone, zero-shot geo-localization systems in a black-box setting with\nrealistic constraints. We consider three main scenarios for this thorough\ninvestigation: a) fixed text-based prompt; b) semantically-equivalent\ntext-based prompts; and c) semantically-equivalent query images. We also take\ninto account the auto-regressive and probabilistic generation process of the\nVLMs when investigating their utility for geo-localization task by using model\nconsistency as a metric in addition to traditional accuracy. Our work provides\nnew insights in the capabilities of different VLMs for the above-mentioned\nscenarios.\n","authors":["Sania Waheed","Bruno Ferrarini","Michael Milford","Sarvapali D. Ramchurn","Shoaib Ehsan"],"pdf_url":"https://arxiv.org/pdf/2501.16947v1.pdf","comment":"Submitted to IROS 2025"},{"id":"http://arxiv.org/abs/2501.16929v1","updated":"2025-01-28T13:18:27Z","published":"2025-01-28T13:18:27Z","title":"Giving Sense to Inputs: Toward an Accessible Control Framework for\n Shared Autonomy","summary":" While shared autonomy offers significant potential for assistive robotics,\nkey questions remain about how to effectively map 2D control inputs to 6D robot\nmotions. An intuitive framework should allow users to input commands\neffortlessly, with the robot responding as expected, without users needing to\nanticipate the impact of their inputs. In this article, we propose a dynamic\ninput mapping framework that links joystick movements to motions on control\nframes defined along a trajectory encoded with canal surfaces. We evaluate our\nmethod in a user study with 20 participants, demonstrating that our input\nmapping framework reduces the workload and improves usability compared to a\nbaseline mapping with similar motion encoding. 
To prepare for deployment in\nassistive scenarios, we built on the development from the accessible gaming\ncommunity to select an accessible control interface. We then tested the system\nin an exploratory study, where three wheelchair users controlled the robot for\nboth daily living activities and a creative painting task, demonstrating its\nfeasibility for users closer to our target population.\n","authors":["Shalutha Rajapakshe","Jean-Marc Odobez","Emmanuel Senft"],"pdf_url":"https://arxiv.org/pdf/2501.16929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16899v1","updated":"2025-01-28T12:35:06Z","published":"2025-01-28T12:35:06Z","title":"RDMM: Fine-Tuned LLM Models for On-Device Robotic Decision Making with\n Enhanced Contextual Awareness in Specific Domains","summary":" Large language models (LLMs) represent a significant advancement in\nintegrating physical robots with AI-driven systems. We showcase the\ncapabilities of our framework within the context of the real-world household\ncompetition. This research introduces a framework that utilizes RDMM (Robotics\nDecision-Making Models), which possess the capacity for decision-making within\ndomain-specific contexts, as well as an awareness of their personal knowledge\nand capabilities. The framework leverages information to enhance the autonomous\ndecision-making of the system. In contrast to other approaches, our focus is on\nreal-time, on-device solutions, successfully operating on hardware with as\nlittle as 8GB of memory. Our framework incorporates visual perception models\nequipping robots with understanding of their environment. Additionally, the\nframework has integrated real-time speech recognition capabilities, thus\nenhancing the human-robot interaction experience. Experimental results\ndemonstrate that the RDMM framework can plan with an 93\\% accuracy.\nFurthermore, we introduce a new dataset consisting of 27k planning instances,\nas well as 1.3k text-image annotated samples derived from the competition. The\nframework, benchmarks, datasets, and models developed in this work are publicly\navailable on our GitHub repository at https://github.com/shadynasrat/RDMM.\n","authors":["Shady Nasrat","Myungsu Kim","Seonil Lee","Jiho Lee","Yeoncheol Jang","Seung-joon Yi"],"pdf_url":"https://arxiv.org/pdf/2501.16899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16868v1","updated":"2025-01-28T11:39:02Z","published":"2025-01-28T11:39:02Z","title":"Event-Based Adaptive Koopman Framework for Optic Flow-Guided Landing on\n Moving Platforms","summary":" This paper presents an optic flow-guided approach for achieving soft landings\nby resource-constrained unmanned aerial vehicles (UAVs) on dynamic platforms.\nAn offline data-driven linear model based on Koopman operator theory is\ndeveloped to describe the underlying (nonlinear) dynamics of optic flow output\nobtained from a single monocular camera that maps to vehicle acceleration as\nthe control input. Moreover, a novel adaptation scheme within the Koopman\nframework is introduced online to handle uncertainties such as unknown platform\nmotion and ground effect, which exert a significant influence during the\nterminal stage of the descent process. Further, to minimize computational\noverhead, an event-based adaptation trigger is incorporated into an\nevent-driven Model Predictive Control (MPC) strategy to regulate optic flow and\ntrack a desired reference. 
A detailed convergence analysis ensures global\nconvergence of the tracking error to a uniform ultimate bound while ensuring\nZeno-free behavior. Simulation results demonstrate the algorithm's robustness\nand effectiveness in landing on dynamic platforms under ground effect and\nsensor noise, which compares favorably to non-adaptive event-triggered and\ntime-triggered adaptive schemes.\n","authors":["Bazeela Banday","Chandan Kumar Sah","Jishnu Keshavan"],"pdf_url":"https://arxiv.org/pdf/2501.16868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07594v2","updated":"2025-01-28T09:32:08Z","published":"2024-04-11T09:23:44Z","title":"Weakly-Supervised Learning via Multi-Lateral Decoder Branching for Tool\n Segmentation in Robot-Assisted Cardiovascular Catheterization","summary":" Robot-assisted catheterization has garnered a good attention for its\npotentials in treating cardiovascular diseases. However, advancing\nsurgeon-robot collaboration still requires further research, particularly on\ntask-specific automation. For instance, automated tool segmentation can assist\nsurgeons in visualizing and tracking of endovascular tools during cardiac\nprocedures. While learning-based models have demonstrated state-of-the-art\nsegmentation performances, generating ground-truth labels for fully-supervised\nmethods is both labor-intensive time consuming, and costly. In this study, we\npropose a weakly-supervised learning method with multi-lateral pseudo labeling\nfor tool segmentation in cardiovascular angiogram datasets. The method utilizes\na modified U-Net architecture featuring one encoder and multiple laterally\nbranched decoders. The decoders generate diverse pseudo labels under different\nperturbations, augmenting available partial labels. The pseudo labels are\nself-generated using a mixed loss function with shared consistency across the\ndecoders. The weakly-supervised model was trained end-to-end and validated\nusing partially annotated angiogram data from three cardiovascular\ncatheterization procedures. Validation results show that the model could\nperform closer to fully-supervised models. Also, the proposed weakly-supervised\nmulti-lateral method outperforms three well known methods used for\nweakly-supervised learning, offering the highest segmentation performance\nacross the three angiogram datasets. Furthermore, numerous ablation studies\nconfirmed the model's consistent performance under different parameters.\nFinally, the model was applied for tool segmentation in a robot-assisted\ncatheterization experiments. The model enhanced visualization with high\nconnectivity indices for guidewire and catheter, and a mean processing time of\n35 ms per frame.\n","authors":["Olatunji Mumini Omisore","Toluwanimi Akinyemi","Anh Nguyen","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15830v2","updated":"2025-01-28T09:25:31Z","published":"2025-01-27T07:34:33Z","title":"SpatialVLA: Exploring Spatial Representations for Visual-Language-Action\n Model","summary":" In this paper, we claim that spatial understanding is the keypoint in robot\nmanipulation, and propose SpatialVLA to explore effective spatial\nrepresentations for the robot foundation model. 
Specifically, we introduce\nEgo3D Position Encoding to inject 3D information into the input observations of\nthe visual-language-action model, and propose Adaptive Action Grids to\nrepresent spatial robot movement actions with adaptive discretized action\ngrids, facilitating learning generalizable and transferrable spatial action\nknowledge for cross-robot control. SpatialVLA is first pre-trained on top of a\nvision-language model with 1.1 Million real-world robot episodes, to learn a\ngeneralist manipulation policy across multiple robot environments and tasks.\nAfter pre-training, SpatialVLA is directly applied to perform numerous tasks in\na zero-shot manner. The superior results in both simulation and real-world\nrobots demonstrate its advantage of inferring complex robot motion trajectories\nand its strong in-domain multi-task generalization ability. We further show the\nproposed Adaptive Action Grids offer a new and effective way to fine-tune the\npre-trained SpatialVLA model for new simulation and real-world setups, where\nthe pre-learned action grids are re-discretized to capture robot-specific\nspatial action movements of new setups. The superior results from extensive\nevaluations demonstrate the exceptional in-distribution generalization and\nout-of-distribution adaptation capability, highlighting the crucial benefit of\nthe proposed spatial-aware representations for generalist robot policy\nlearning. All the details and codes will be open-sourced.\n","authors":["Delin Qu","Haoming Song","Qizhi Chen","Yuanqi Yao","Xinyi Ye","Yan Ding","Zhigang Wang","JiaYuan Gu","Bin Zhao","Dong Wang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2501.15830v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10991v2","updated":"2025-01-28T09:21:46Z","published":"2025-01-19T09:23:57Z","title":"Front Hair Styling Robot System Using Path Planning for Root-Centric\n Strand Adjustment","summary":" Hair styling is a crucial aspect of personal grooming, significantly\ninfluenced by the appearance of front hair. While brushing is commonly used\nboth to detangle hair and for styling purposes, existing research primarily\nfocuses on robotic systems for detangling hair, with limited exploration into\nrobotic hair styling. This research presents a novel robotic system designed to\nautomatically adjust front hairstyles, with an emphasis on path planning for\nroot-centric strand adjustment. The system utilizes images to compare the\ncurrent hair state with the desired target state through an orientation map of\nhair strands. By concentrating on the differences in hair orientation and\nspecifically targeting adjustments at the root of each strand, the system\nperforms detailed styling tasks. The path planning approach ensures effective\nalignment of the hairstyle with the target, and a closed-loop mechanism refines\nthese adjustments to accurately evolve the hairstyle towards the desired\noutcome. 
Experimental results demonstrate that the proposed system achieves a\nhigh degree of similarity and consistency in front hair styling, showing\npromising results for automated, precise hairstyle adjustments.\n","authors":["Soonhyo Kim","Naoaki Kanazawa","Shun Hasegawa","Kento Kawaharazuka","Kei Okada"],"pdf_url":"https://arxiv.org/pdf/2501.10991v2.pdf","comment":"Accepted at IEEE/SICE SII2025"},{"id":"http://arxiv.org/abs/2501.16803v1","updated":"2025-01-28T09:08:31Z","published":"2025-01-28T09:08:31Z","title":"RG-Attn: Radian Glue Attention for Multi-modality Multi-agent\n Cooperative Perception","summary":" Cooperative perception offers an optimal solution to overcome the perception\nlimitations of single-agent systems by leveraging Vehicle-to-Everything (V2X)\ncommunication for data sharing and fusion across multiple agents. However, most\nexisting approaches focus on single-modality data exchange, limiting the\npotential of both homogeneous and heterogeneous fusion across agents. This\noverlooks the opportunity to utilize multi-modality data per agent, restricting\nthe system's performance. In the automotive industry, manufacturers adopt\ndiverse sensor configurations, resulting in heterogeneous combinations of\nsensor modalities across agents. To harness the potential of every possible\ndata source for optimal performance, we design a robust LiDAR and camera\ncross-modality fusion module, Radian-Glue-Attention (RG-Attn), applicable to\nboth intra-agent cross-modality fusion and inter-agent cross-modality fusion\nscenarios, owing to the convenient coordinate conversion by transformation\nmatrix and the unified sampling/inversion mechanism. We also propose two\ndifferent architectures, named Paint-To-Puzzle (PTP) and\nCo-Sketching-Co-Coloring (CoS-CoCo), for conducting cooperative perception. PTP\naims for maximum precision performance and achieves smaller data packet size by\nlimiting cross-agent fusion to a single instance, but requiring all\nparticipants to be equipped with LiDAR. In contrast, CoS-CoCo supports agents\nwith any configuration-LiDAR-only, camera-only, or LiDAR-camera-both,\npresenting more generalization ability. Our approach achieves state-of-the-art\n(SOTA) performance on both real and simulated cooperative perception datasets.\nThe code will be released at GitHub in early 2025.\n","authors":["Lantao Li","Kang Yang","Wenqi Zhang","Xiaoxue Wang","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2501.16803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16800v1","updated":"2025-01-28T09:05:03Z","published":"2025-01-28T09:05:03Z","title":"DIRIGENt: End-To-End Robotic Imitation of Human Demonstrations Based on\n a Diffusion Model","summary":" There has been substantial progress in humanoid robots, with new skills\ncontinuously being taught, ranging from navigation to manipulation. While these\nabilities may seem impressive, the teaching methods often remain inefficient.\nTo enhance the process of teaching robots, we propose leveraging a mechanism\neffectively used by humans: teaching by demonstrating. In this paper, we\nintroduce DIRIGENt (DIrect Robotic Imitation GENeration model), a novel\nend-to-end diffusion approach that directly generates joint values from\nobserving human demonstrations, enabling a robot to imitate these actions\nwithout any existing mapping between it and humans. We create a dataset in\nwhich humans imitate a robot and then use this collected data to train a\ndiffusion model that enables a robot to imitate humans. 
The following three\naspects are the core of our contribution. First is our novel dataset with\nnatural pairs between human and robot poses, allowing our approach to imitate\nhumans accurately despite the gap between their anatomies. Second, the\ndiffusion input to our model alleviates the challenge of redundant joint\nconfigurations, limiting the search space. And finally, our end-to-end\narchitecture from perception to action leads to an improved learning\ncapability. Through our experimental analysis, we show that combining these\nthree aspects allows DIRIGENt to outperform existing state-of-the-art\napproaches in the field of generating joint values from RGB images.\n","authors":["Josua Spisak","Matthias Kerzel","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2501.16800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16754v1","updated":"2025-01-28T07:15:39Z","published":"2025-01-28T07:15:39Z","title":"SSF-PAN: Semantic Scene Flow-Based Perception for Autonomous Navigation\n in Traffic Scenarios","summary":" Vehicle detection and localization in complex traffic scenarios pose\nsignificant challenges due to the interference of moving objects. Traditional\nmethods often rely on outlier exclusions or semantic segmentations, which\nsuffer from low computational efficiency and accuracy. The proposed SSF-PAN can\nachieve the functionalities of LiDAR point cloud based object\ndetection/localization and SLAM (Simultaneous Localization and Mapping) with\nhigh computational efficiency and accuracy, enabling map-free navigation\nframeworks. The novelty of this work is threefold: 1) developing a neural\nnetwork which can achieve segmentation among static and dynamic objects within\nthe scene flows with different motion features, that is, semantic scene flow\n(SSF); 2) developing an iterative framework which can further optimize the\nquality of input scene flows and output segmentation results; 3) developing a\nscene flow-based navigation platform which can test the performance of the SSF\nperception system in the simulation environment. The proposed SSF-PAN method is\nvalidated using the SUScape-CARLA and the KITTI datasets, as well as on the\nCARLA simulator. Experimental results demonstrate that the proposed approach\noutperforms traditional methods in terms of scene flow computation accuracy,\nmoving object detection accuracy, computational efficiency, and autonomous\nnavigation effectiveness.\n","authors":["Yinqi Chen","Meiying Zhang","Qi Hao","Guang Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.16754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16743v1","updated":"2025-01-28T06:40:29Z","published":"2025-01-28T06:40:29Z","title":"Hierarchical Trajectory (Re)Planning for a Large Scale Swarm","summary":" We consider the trajectory replanning problem for a large-scale swarm in a\ncluttered environment. Our path planner replans for robots by utilizing a\nhierarchical approach, dividing the workspace, and computing collision-free\npaths for robots within each cell in parallel. Distributed trajectory\noptimization generates a deadlock-free trajectory for efficient execution and\nmaintains the control feasibility even when the optimization fails. Our\nhierarchical approach combines the benefits of both centralized and\ndecentralized methods, achieving a high task success rate while providing\nreal-time replanning capability. Compared to decentralized approaches, our\napproach effectively avoids deadlocks and collisions, significantly increasing\nthe task success rate. 
We demonstrate the real-time performance of our\nalgorithm with up to 142 robots in simulation, and a representative 24 physical\nCrazyflie nano-quadrotor experiment.\n","authors":["Lishuo Pan","Yutong Wang","Nora Ayanian"],"pdf_url":"https://arxiv.org/pdf/2501.16743v1.pdf","comment":"13 pages, 14 figures. arXiv admin note: substantial text overlap with\n arXiv:2407.02777"},{"id":"http://arxiv.org/abs/2501.16733v1","updated":"2025-01-28T06:18:29Z","published":"2025-01-28T06:18:29Z","title":"Dream to Drive with Predictive Individual World Model","summary":" It is still a challenging topic to make reactive driving behaviors in complex\nurban environments as road users' intentions are unknown. Model-based\nreinforcement learning (MBRL) offers great potential to learn a reactive policy\nby constructing a world model that can provide informative states and\nimagination training. However, a critical limitation in relevant research lies\nin the scene-level reconstruction representation learning, which may overlook\nkey interactive vehicles and hardly model the interactive features among\nvehicles and their long-term intentions. Therefore, this paper presents a novel\nMBRL method with a predictive individual world model (PIWM) for autonomous\ndriving. PIWM describes the driving environment from an individual-level\nperspective and captures vehicles' interactive relations and their intentions\nvia trajectory prediction task. Meanwhile, a behavior policy is learned jointly\nwith PIWM. It is trained in PIWM's imagination and effectively navigates in the\nurban driving scenes leveraging intention-aware latent states. The proposed\nmethod is trained and evaluated on simulation environments built upon\nreal-world challenging interactive scenarios. Compared with popular model-free\nand state-of-the-art model-based reinforcement learning methods, experimental\nresults show that the proposed method achieves the best performance in terms of\nsafety and efficiency.\n","authors":["Yinfeng Gao","Qichao Zhang","Da-wei Ding","Dongbin Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.16733v1.pdf","comment":"Codes: https://github.com/gaoyinfeng/PIWM"},{"id":"http://arxiv.org/abs/2501.16728v1","updated":"2025-01-28T06:12:24Z","published":"2025-01-28T06:12:24Z","title":"Optimizing Efficiency of Mixed Traffic through Reinforcement Learning: A\n Topology-Independent Approach and Benchmark","summary":" This paper presents a mixed traffic control policy designed to optimize\ntraffic efficiency across diverse road topologies, addressing issues of\ncongestion prevalent in urban environments. A model-free reinforcement learning\n(RL) approach is developed to manage large-scale traffic flow, using data\ncollected by autonomous vehicles to influence human-driven vehicles. A\nreal-world mixed traffic control benchmark is also released, which includes 444\nscenarios from 20 countries, representing a wide geographic distribution and\ncovering a variety of scenarios and road topologies. This benchmark serves as a\nfoundation for future research, providing a realistic simulation environment\nfor the development of effective policies. Comprehensive experiments\ndemonstrate the effectiveness and adaptability of the proposed method,\nachieving better performance than existing traffic control methods in both\nintersection and roundabout scenarios. To the best of our knowledge, this is\nthe first project to introduce a real-world complex scenarios mixed traffic\ncontrol benchmark. 
Videos and code of our work are available at\nhttps://sites.google.com/berkeley.edu/mixedtrafficplus/home\n","authors":["Chuyang Xiao","Dawei Wang","Xinzheng Tang","Jia Pan","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2501.16728v1.pdf","comment":"accepted to ICRA 2025"},{"id":"http://arxiv.org/abs/2501.16719v1","updated":"2025-01-28T05:53:29Z","published":"2025-01-28T05:53:29Z","title":"Safety-Critical Control for Aerial Physical Interaction in Uncertain\n Environment","summary":" Aerial manipulation for safe physical interaction with their environments is\ngaining significant momentum in robotics research. In this paper, we present a\ndisturbance-observer-based safety-critical control for a fully actuated aerial\nmanipulator interacting with both static and dynamic structures. Our approach\ncenters on a safety filter that dynamically adjusts the desired trajectory of\nthe vehicle's pose, accounting for the aerial manipulator's dynamics, the\ndisturbance observer's structure, and motor thrust limits. We provide rigorous\nproof that the proposed safety filter ensures the forward invariance of the\nsafety set - representing motor thrust limits - even in the presence of\ndisturbance estimation errors. To demonstrate the superiority of our method\nover existing control strategies for aerial physical interaction, we perform\ncomparative experiments involving complex tasks, such as pushing against a\nstatic structure and pulling a plug firmly attached to an electric socket.\nFurthermore, to highlight its repeatability in scenarios with sudden dynamic\nchanges, we perform repeated tests of pushing a movable cart and extracting a\nplug from a socket. These experiments confirm that our method not only\noutperforms existing methods but also excels in handling tasks with rapid\ndynamic variations.\n","authors":["Jeonghyun Byun","Yeonjoon Kim","Dongjae Lee","H. Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2501.16719v1.pdf","comment":"to be presented in 2025 IEEE International Conference on Robotics and\n Automation (ICRA), Atlanta, USA, 2025"},{"id":"http://arxiv.org/abs/2501.16717v1","updated":"2025-01-28T05:49:35Z","published":"2025-01-28T05:49:35Z","title":"Strawberry Robotic Operation Interface: An Open-Source Device for\n Collecting Dexterous Manipulation Data in Robotic Strawberry Farming","summary":" The strawberry farming is labor-intensive, particularly in tasks requiring\ndexterous manipulation such as picking occluded strawberries. To address this\nchallenge, we present the Strawberry Robotic Operation Interface (SROI), an\nopen-source device designed for collecting dexterous manipulation data in\nrobotic strawberry farming. The SROI features a handheld unit with a modular\nend effector, a stereo robotic camera, enabling the easy collection of\ndemonstration data in field environments. A data post-processing pipeline is\nintroduced to extract spatial trajectories and gripper states from the\ncollected data. Additionally, we release an open-source dataset of strawberry\npicking demonstrations to facilitate research in dexterous robotic\nmanipulation. 
The SROI represents a step toward automating complex strawberry\nfarming tasks, reducing reliance on manual labor.\n","authors":["Linsheng Hou","Wenwu Lu","Yanan Wang","Chen Peng","Zhenghao Fei"],"pdf_url":"https://arxiv.org/pdf/2501.16717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16698v1","updated":"2025-01-28T04:31:19Z","published":"2025-01-28T04:31:19Z","title":"3D-MoE: A Mixture-of-Experts Multi-modal LLM for 3D Vision and Pose\n Diffusion via Rectified Flow","summary":" 3D vision and spatial reasoning have long been recognized as preferable for\naccurately perceiving our three-dimensional world, especially when compared\nwith traditional visual reasoning based on 2D images. Due to the difficulties\nin collecting high-quality 3D data, research in this area has only recently\ngained momentum. With the advent of powerful large language models (LLMs),\nmulti-modal LLMs for 3D vision have been developed over the past few years.\nHowever, most of these models focus primarily on the vision encoder for 3D\ndata. In this paper, we propose converting existing densely activated LLMs into\nmixture-of-experts (MoE) models, which have proven effective for multi-modal\ndata processing. In addition to leveraging these models' instruction-following\ncapabilities, we further enable embodied task planning by attaching a diffusion\nhead, Pose-DiT, that employs a novel rectified flow diffusion scheduler.\nExperimental results on 3D question answering and task-planning tasks\ndemonstrate that our 3D-MoE framework achieves improved performance with fewer\nactivated parameters.\n","authors":["Yueen Ma","Yuzheng Zhuang","Jianye Hao","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2501.16698v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2501.16664v1","updated":"2025-01-28T02:53:48Z","published":"2025-01-28T02:53:48Z","title":"Improving Vision-Language-Action Model with Online Reinforcement\n Learning","summary":" Recent studies have successfully integrated large vision-language models\n(VLMs) into low-level robotic control by supervised fine-tuning (SFT) with\nexpert robotic datasets, resulting in what we term vision-language-action (VLA)\nmodels. Although the VLA models are powerful, how to improve these large models\nduring interaction with environments remains an open question. In this paper,\nwe explore how to further improve these VLA models via Reinforcement Learning\n(RL), a commonly used fine-tuning technique for large models. However, we find\nthat directly applying online RL to large VLA models presents significant\nchallenges, including training instability that severely impacts the\nperformance of large models, and computing burdens that exceed the capabilities\nof most local machines. To address these challenges, we propose iRe-VLA\nframework, which iterates between Reinforcement Learning and Supervised\nLearning to effectively improve VLA models, leveraging the exploratory benefits\nof RL while maintaining the stability of supervised learning. 
Experiments in\ntwo simulated benchmarks and a real-world manipulation suite validate the\neffectiveness of our method.\n","authors":["Yanjiang Guo","Jianke Zhang","Xiaoyu Chen","Xiang Ji","Yen-Jen Wang","Yucheng Hu","Jianyu Chen"],"pdf_url":"https://arxiv.org/pdf/2501.16664v1.pdf","comment":"Accepted to ICRA 2025"},{"id":"http://arxiv.org/abs/2403.02508v3","updated":"2025-01-28T02:12:38Z","published":"2024-03-04T21:54:51Z","title":"Collision Avoidance and Geofencing for Fixed-wing Aircraft with Control\n Barrier Functions","summary":" Safety-critical failures often have fatal consequences in aerospace control.\nControl systems on aircraft, therefore, must ensure the strict satisfaction of\nsafety constraints, preferably with formal guarantees of safe behavior. This\npaper establishes the safety-critical control of fixed-wing aircraft in\ncollision avoidance and geofencing tasks. A control framework is developed\nwherein a run-time assurance (RTA) system modulates the nominal flight\ncontroller of the aircraft whenever necessary to prevent it from colliding with\nother aircraft or crossing a boundary (geofence) in space. The RTA is\nformulated as a safety filter using control barrier functions (CBFs) with\nformal guarantees of safe behavior. CBFs are constructed and compared for a\nnonlinear kinematic fixed-wing aircraft model. The proposed CBF-based\ncontrollers showcase the capability of safely executing simultaneous collision\navoidance and geofencing, as demonstrated by simulations on the kinematic model\nand a high-fidelity dynamical model.\n","authors":["Tamas G. Molnar","Suresh K. Kannan","James Cunningham","Kyle Dunlap","Kerianne L. Hobbs","Aaron D. Ames"],"pdf_url":"https://arxiv.org/pdf/2403.02508v3.pdf","comment":"Accepted to the IEEE Transactions on Control System Technology. 15\n pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.17524v3","updated":"2025-01-28T01:46:20Z","published":"2024-10-23T03:01:43Z","title":"Mechanisms and Computational Design of Multi-Modal End-Effector with\n Force Sensing using Gated Networks","summary":" In limbed robotics, end-effectors must serve dual functions, such as both\nfeet for locomotion and grippers for grasping, which presents design\nchallenges. This paper introduces a multi-modal end-effector capable of\ntransitioning between flat and line foot configurations while providing\ngrasping capabilities. MAGPIE integrates 8-axis force sensing using proposed\nmechanisms with hall effect sensors, enabling both contact and tactile force\nmeasurements. We present a computational design framework for our sensing\nmechanism that accounts for noise and interference, allowing for desired\nsensitivity and force ranges and generating ideal inverse models. 
The hardware\nimplementation of MAGPIE is validated through experiments, demonstrating its\ncapability as a foot and verifying the performance of the sensing mechanisms,\nideal models, and gated network-based models.\n","authors":["Yusuke Tanaka","Alvin Zhu","Richard Lin","Ankur Mehta","Dennis Hong"],"pdf_url":"https://arxiv.org/pdf/2410.17524v3.pdf","comment":"Proceeding to 2025 IEEE International Conference on Robotics and\n Automation (ICRA25)"},{"id":"http://arxiv.org/abs/2501.16590v1","updated":"2025-01-28T00:07:28Z","published":"2025-01-28T00:07:28Z","title":"Benchmarking Model Predictive Control and Reinforcement Learning Based\n Control for Legged Robot Locomotion in MuJoCo Simulation","summary":" Model Predictive Control (MPC) and Reinforcement Learning (RL) are two\nprominent strategies for controlling legged robots, each with unique strengths.\nRL learns control policies through system interaction, adapting to various\nscenarios, whereas MPC relies on a predefined mathematical model to solve\noptimization problems in real-time. Despite their widespread use, there is a\nlack of direct comparative analysis under standardized conditions. This work\naddresses this gap by benchmarking MPC and RL controllers on a Unitree Go1\nquadruped robot within the MuJoCo simulation environment, focusing on a\nstandardized task-straight walking at a constant velocity. Performance is\nevaluated based on disturbance rejection, energy efficiency, and terrain\nadaptability. The results show that RL excels in handling disturbances and\nmaintaining energy efficiency but struggles with generalization to new terrains\ndue to its dependence on learned policies tailored to specific environments. In\ncontrast, MPC shows enhanced recovery capabilities from larger perturbations by\nleveraging its optimization-based approach, allowing for a balanced\ndistribution of control efforts across the robot's joints. The results provide\na clear understanding of the advantages and limitations of both RL and MPC,\noffering insights into selecting an appropriate control strategy for legged\nrobotic applications.\n","authors":["Shivayogi Akki","Tan Chen"],"pdf_url":"https://arxiv.org/pdf/2501.16590v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.17162v1","updated":"2025-01-28T18:59:49Z","published":"2025-01-28T18:59:49Z","title":"CubeDiff: Repurposing Diffusion-Based Image Models for Panorama\n Generation","summary":" We introduce a novel method for generating 360{\\deg} panoramas from text\nprompts or images. Our approach leverages recent advances in 3D generation by\nemploying multi-view diffusion models to jointly synthesize the six faces of a\ncubemap. Unlike previous methods that rely on processing equirectangular\nprojections or autoregressive generation, our method treats each face as a\nstandard perspective image, simplifying the generation process and enabling the\nuse of existing multi-view diffusion models. We demonstrate that these models\ncan be adapted to produce high-quality cubemaps without requiring\ncorrespondence-aware attention layers. Our model allows for fine-grained text\ncontrol, generates high resolution panorama images and generalizes well beyond\nits training set, whilst achieving state-of-the-art results, both qualitatively\nand quantitatively. 
Project page: https://cubediff.github.io/\n","authors":["Nikolai Kalischek","Michael Oechsle","Fabian Manhardt","Philipp Henzler","Konrad Schindler","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2501.17162v1.pdf","comment":"Accepted at ICLR 2025"},{"id":"http://arxiv.org/abs/2501.17161v1","updated":"2025-01-28T18:59:44Z","published":"2025-01-28T18:59:44Z","title":"SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model\n Post-training","summary":" Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used\npost-training techniques for foundation models. However, their roles in\nenhancing model generalization capabilities remain unclear. This paper studies\nthe difference between SFT and RL on generalization and memorization, focusing\non text-based rule variants and visual variants. We introduce GeneralPoints, an\narithmetic reasoning card game, and adopt V-IRL, a real-world navigation\nenvironment, to assess how models trained with SFT and RL generalize to unseen\nvariants in both textual and visual domains. We show that RL, especially when\ntrained with an outcome-based reward, generalizes across both rule-based\ntextual and visual variants. SFT, in contrast, tends to memorize training data\nand struggles to generalize out-of-distribution scenarios. Further analysis\nreveals that RL improves the model's underlying visual recognition\ncapabilities, contributing to its enhanced generalization in the visual domain.\nDespite RL's superior generalization, we show that SFT remains essential for\neffective RL training; SFT stabilizes the model's output format, enabling\nsubsequent RL to achieve its performance gains. These findings demonstrates the\ncapability of RL for acquiring generalizable knowledge in complex, multi-modal\ntasks.\n","authors":["Tianzhe Chu","Yuexiang Zhai","Jihan Yang","Shengbang Tong","Saining Xie","Dale Schuurmans","Quoc V. Le","Sergey Levine","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2501.17161v1.pdf","comment":"Website at https://tianzhechu.com/SFTvsRL"},{"id":"http://arxiv.org/abs/2501.17160v1","updated":"2025-01-28T18:59:21Z","published":"2025-01-28T18:59:21Z","title":"A Hybrid Deep Learning CNN Model for Enhanced COVID-19 Detection from\n Computed Tomography (CT) Scan Images","summary":" Early detection of COVID-19 is crucial for effective treatment and\ncontrolling its spread. This study proposes a novel hybrid deep learning model\nfor detecting COVID-19 from CT scan images, designed to assist overburdened\nmedical professionals. Our proposed model leverages the strengths of VGG16,\nDenseNet121, and MobileNetV2 to extract features, followed by Principal\nComponent Analysis (PCA) for dimensionality reduction, after which the features\nare stacked and classified using a Support Vector Classifier (SVC). We\nconducted comparative analysis between the proposed hybrid model and individual\npre-trained CNN models, using a dataset of 2,108 training images and 373 test\nimages comprising both COVID-positive and non-COVID images. 
Our proposed hybrid\nmodel achieved an accuracy of 98.93%, outperforming the individual models in\nterms of precision, recall, F1 scores, and ROC curve performance.\n","authors":["Suresh Babu Nettur","Shanthi Karpurapu","Unnati Nettur","Likhit Sagar Gajja","Sravanthy Myneni","Akhil Dusi","Lalithya Posham"],"pdf_url":"https://arxiv.org/pdf/2501.17160v1.pdf","comment":"Corresponding authors: Shanthi Karpurapu\n (shanthi.karpurapu@gmail.com), Suresh Babu Nettur (nettursuresh@gmail.com)\n Shanthi Karpurapu and Suresh Babu Nettur are co-first authors"},{"id":"http://arxiv.org/abs/2501.17159v1","updated":"2025-01-28T18:59:03Z","published":"2025-01-28T18:59:03Z","title":"IC-Portrait: In-Context Matching for View-Consistent Personalized\n Portrait","summary":" Existing diffusion models show great potential for identity-preserving\ngeneration. However, personalized portrait generation remains challenging due\nto the diversity in user profiles, including variations in appearance and\nlighting conditions. To address these challenges, we propose IC-Portrait, a\nnovel framework designed to accurately encode individual identities for\npersonalized portrait generation. Our key insight is that pre-trained diffusion\nmodels are fast learners (e.g.,100 ~ 200 steps) for in-context dense\ncorrespondence matching, which motivates the two major designs of our\nIC-Portrait framework. Specifically, we reformulate portrait generation into\ntwo sub-tasks: 1) Lighting-Aware Stitching: we find that masking a high\nproportion of the input image, e.g., 80%, yields a highly effective\nself-supervisory representation learning of reference image lighting. 2)\nView-Consistent Adaptation: we leverage a synthetic view-consistent profile\ndataset to learn the in-context correspondence. The reference profile can then\nbe warped into arbitrary poses for strong spatial-aligned view conditioning.\nCoupling these two designs by simply concatenating latents to form\nControlNet-like supervision and modeling, enables us to significantly enhance\nthe identity preservation fidelity and stability. Extensive evaluations\ndemonstrate that IC-Portrait consistently outperforms existing state-of-the-art\nmethods both quantitatively and qualitatively, with particularly notable\nimprovements in visual qualities. Furthermore, IC-Portrait even demonstrates\n3D-aware relighting capabilities.\n","authors":["Han Yang","Enis Simsar","Sotiris Anagnostidi","Yanlong Zang","Thomas Hofmann","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.17159v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2410.14462v4","updated":"2025-01-28T18:35:41Z","published":"2024-10-18T13:44:29Z","title":"LUDVIG: Learning-free Uplifting of 2D Visual features to Gaussian\n Splatting scenes","summary":" We address the problem of extending the capabilities of vision foundation\nmodels such as DINO, SAM, and CLIP, to 3D tasks. Specifically, we introduce a\nnovel method to uplift 2D image features into Gaussian Splatting\nrepresentations of 3D scenes. Unlike traditional approaches that rely on\nminimizing a reconstruction loss, our method employs a simpler and more\nefficient feature aggregation technique, augmented by a graph diffusion\nmechanism. Graph diffusion refines 3D features, such as coarse segmentation\nmasks, by leveraging 3D geometry and pairwise similarities induced by DINOv2.\nOur approach achieves performance comparable to the state of the art on\nmultiple downstream tasks while delivering significant speed-ups. 
Notably, we\nobtain competitive segmentation results using generic DINOv2 features, despite\nDINOv2 not being trained on millions of annotated segmentation masks like SAM.\nWhen applied to CLIP features, our method demonstrates strong performance in\nopen-vocabulary object localization tasks, highlighting the versatility of our\napproach.\n","authors":["Juliette Marrie","Romain Menegaux","Michael Arbel","Diane Larlus","Julien Mairal"],"pdf_url":"https://arxiv.org/pdf/2410.14462v4.pdf","comment":"Project page: https://juliettemarrie.github.io/ludvig"},{"id":"http://arxiv.org/abs/2501.17131v1","updated":"2025-01-28T18:23:12Z","published":"2025-01-28T18:23:12Z","title":"Scenario Understanding of Traffic Scenes Through Large Visual Language\n Models","summary":" Deep learning models for autonomous driving, encompassing perception,\nplanning, and control, depend on vast datasets to achieve their high\nperformance. However, their generalization often suffers due to domain-specific\ndata distributions, making an effective scene-based categorization of samples\nnecessary to improve their reliability across diverse domains. Manual\ncaptioning, though valuable, is both labor-intensive and time-consuming,\ncreating a bottleneck in the data annotation process. Large Visual Language\nModels (LVLMs) present a compelling solution by automating image analysis and\ncategorization through contextual queries, often without requiring retraining\nfor new categories. In this study, we evaluate the capabilities of LVLMs,\nincluding GPT-4 and LLaVA, to understand and classify urban traffic scenes on\nboth an in-house dataset and the BDD100K. We propose a scalable captioning\npipeline that integrates state-of-the-art models, enabling a flexible\ndeployment on new datasets. Our analysis, combining quantitative metrics with\nqualitative insights, demonstrates the effectiveness of LVLMs to understand\nurban traffic scenarios and highlights their potential as an efficient tool for\ndata-driven advancements in autonomous driving.\n","authors":["Rivera Esteban","Lübberstedt Jannik","Nico Uhlemann","Markus Lienkamp"],"pdf_url":"https://arxiv.org/pdf/2501.17131v1.pdf","comment":"Accepted at WACV2025"},{"id":"http://arxiv.org/abs/2501.17099v1","updated":"2025-01-28T17:39:50Z","published":"2025-01-28T17:39:50Z","title":"Text-to-Image Generation for Vocabulary Learning Using the Keyword\n Method","summary":" The 'keyword method' is an effective technique for learning vocabulary of a\nforeign language. It involves creating a memorable visual link between what a\nword means and what its pronunciation in a foreign language sounds like in the\nlearner's native language. However, these memorable visual links remain\nimplicit in the people's mind and are not easy to remember for a large set of\nwords. To enhance the memorisation and recall of the vocabulary, we developed\nan application that combines the keyword method with text-to-image generators\nto externalise the memorable visual links into visuals. These visuals represent\nadditional stimuli during the memorisation process. To explore the\neffectiveness of this approach we first run a pilot study to investigate how\ndifficult it is to externalise the descriptions of mental visualisations of\nmemorable links, by asking participants to write them down. We used these\ndescriptions as prompts for text-to-image generator (DALL-E2) to convert them\ninto images and asked participants to select their favourites. 
Next, we\ncompared different text-to-image generators (DALL-E2, Midjourney, Stable and\nLatent Diffusion) to evaluate the perceived quality of the generated images by\neach. Despite heterogeneous results, participants mostly preferred images\ngenerated by DALL-E2, which was used also for the final study. In this study,\nwe investigated whether providing such images enhances the retention of\nvocabulary being learned, compared to the keyword method only. Our results\nindicate that people did not encounter difficulties describing their\nvisualisations of memorable links and that providing corresponding images\nsignificantly improves memory retention.\n","authors":["Nuwan T. Attygalle","Matjaž Kljun","Aaron Quigley","Klen čOpič Pucihar","Jens Grubert","Verena Biener","Luis A. Leiva","Juri Yoneyama","Alice Toniolo","Angela Miguel","Hirokazu Kato","Maheshya Weerasinghe"],"pdf_url":"https://arxiv.org/pdf/2501.17099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18213v3","updated":"2025-01-28T17:28:47Z","published":"2024-05-28T14:17:41Z","title":"NeRAF: 3D Scene Infused Neural Radiance and Acoustic Fields","summary":" Sound plays a major role in human perception. Along with vision, it provides\nessential information for understanding our surroundings. Despite advances in\nneural implicit representations, learning acoustics that align with visual\nscenes remains a challenge. We propose NeRAF, a method that jointly learns\nacoustic and radiance fields. NeRAF synthesizes both novel views and\nspatialized room impulse responses (RIR) at new positions by conditioning the\nacoustic field on 3D scene geometric and appearance priors from the radiance\nfield. The generated RIR can be applied to auralize any audio signal. Each\nmodality can be rendered independently and at spatially distinct positions,\noffering greater versatility. We demonstrate that NeRAF generates high-quality\naudio on SoundSpaces and RAF datasets, achieving significant performance\nimprovements over prior methods while being more data-efficient. Additionally,\nNeRAF enhances novel view synthesis of complex scenes trained with sparse data\nthrough cross-modal learning. NeRAF is designed as a Nerfstudio module,\nproviding convenient access to realistic audio-visual generation.\n","authors":["Amandine Brunetto","Sascha Hornauer","Fabien Moutarde"],"pdf_url":"https://arxiv.org/pdf/2405.18213v3.pdf","comment":"Project Page: https://amandinebtto.github.io/NeRAF"},{"id":"http://arxiv.org/abs/2501.17085v1","updated":"2025-01-28T17:14:13Z","published":"2025-01-28T17:14:13Z","title":"Evaluating CrowdSplat: Perceived Level of Detail for Gaussian Crowds","summary":" Efficient and realistic crowd rendering is an important element of many\nreal-time graphics applications such as Virtual Reality (VR) and games. To this\nend, Levels of Detail (LOD) avatar representations such as polygonal meshes,\nimage-based impostors, and point clouds have been proposed and evaluated. More\nrecently, 3D Gaussian Splatting has been explored as a potential method for\nreal-time crowd rendering. In this paper, we present a two-alternative forced\nchoice (2AFC) experiment that aims to determine the perceived quality of 3D\nGaussian avatars. Three factors were explored: Motion, LOD (i.e., #Gaussians),\nand the avatar height in Pixels (corresponding to the viewing distance).\nParticipants viewed pairs of animated 3D Gaussian avatars and were tasked with\nchoosing the most detailed one. 
Our findings can inform the optimization of LOD\nstrategies in Gaussian-based crowd rendering, thereby helping to achieve\nefficient rendering while maintaining visual quality in real-time applications.\n","authors":["Xiaohan Sun","Yinghan Xu","John Dingliana","Carol O'Sullivan"],"pdf_url":"https://arxiv.org/pdf/2501.17085v1.pdf","comment":"5 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.16239v2","updated":"2025-01-28T17:09:41Z","published":"2025-01-27T17:35:39Z","title":"Distilling foundation models for robust and efficient models in digital\n pathology","summary":" In recent years, the advent of foundation models (FM) for digital pathology\nhas relied heavily on scaling the pre-training datasets and the model size,\nyielding large and powerful models. While it resulted in improving the\nperformance on diverse downstream tasks, it also introduced increased\ncomputational cost and inference time. In this work, we explore the\ndistillation of a large foundation model into a smaller one, reducing the\nnumber of parameters by several orders of magnitude. Leveraging distillation\ntechniques, our distilled model, H0-mini, achieves nearly comparable\nperformance to large FMs at a significantly reduced inference cost. It is\nevaluated on several public benchmarks, achieving 3rd place on the HEST\nbenchmark and 5th place on the EVA benchmark. Additionally, a robustness\nanalysis conducted on the PLISM dataset demonstrates that our distilled model\nreaches excellent robustness to variations in staining and scanning conditions,\nsignificantly outperforming other state-of-the art models. This opens new\nperspectives to design lightweight and robust models for digital pathology,\nwithout compromising on performance.\n","authors":["Alexandre Filiot","Nicolas Dop","Oussama Tchita","Auriane Riou","Rémy Dubois","Thomas Peeters","Daria Valter","Marin Scalbert","Charlie Saillard","Geneviève Robin","Antoine Olivier"],"pdf_url":"https://arxiv.org/pdf/2501.16239v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2501.17076v1","updated":"2025-01-28T17:01:42Z","published":"2025-01-28T17:01:42Z","title":"DINOSTAR: Deep Iterative Neural Object Detector Self-Supervised Training\n for Roadside LiDAR Applications","summary":" Recent advancements in deep-learning methods for object detection in\npoint-cloud data have enabled numerous roadside applications, fostering\nimprovements in transportation safety and management. However, the intricate\nnature of point-cloud data poses significant challenges for human-supervised\nlabeling, resulting in substantial expenditures of time and capital. This paper\naddresses the issue by developing an end-to-end, scalable, and self-supervised\nframework for training deep object detectors tailored for roadside point-cloud\ndata. The proposed framework leverages self-supervised, statistically modeled\nteachers to train off-the-shelf deep object detectors, thus circumventing the\nneed for human supervision. The teacher models follow fine-tuned set standard\npractices of background filtering, object clustering, bounding-box fitting, and\nclassification to generate noisy labels. It is presented that by training the\nstudent model over the combined noisy annotations from multitude of teachers\nenhances its capacity to discern background/foreground more effectively and\nforces it to learn diverse point-cloud-representations for object categories of\ninterest. 
The evaluations, involving publicly available roadside datasets and\nstate-of-art deep object detectors, demonstrate that the proposed framework\nachieves comparable performance to deep object detectors trained on\nhuman-annotated labels, despite not utilizing such human-annotations in its\ntraining process.\n","authors":["Muhammad Shahbaz","Shaurya Agarwal"],"pdf_url":"https://arxiv.org/pdf/2501.17076v1.pdf","comment":"conference, 6 pages"},{"id":"http://arxiv.org/abs/2501.11733v2","updated":"2025-01-28T16:58:02Z","published":"2025-01-20T20:35:46Z","title":"Mobile-Agent-E: Self-Evolving Mobile Assistant for Complex Tasks","summary":" Smartphones have become indispensable in modern life, yet navigating complex\ntasks on mobile devices often remains frustrating. Recent advancements in large\nmultimodal model (LMM)-based mobile agents have demonstrated the ability to\nperceive and act in mobile environments. However, current approaches face\nsignificant limitations: they fall short in addressing real-world human needs,\nstruggle with reasoning-intensive and long-horizon tasks, and lack mechanisms\nto learn and improve from prior experiences. To overcome these challenges, we\nintroduce Mobile-Agent-E, a hierarchical multi-agent framework capable of\nself-evolution through past experience. By hierarchical, we mean an explicit\nseparation of high-level planning and low-level action execution. The framework\ncomprises a Manager, responsible for devising overall plans by breaking down\ncomplex tasks into subgoals, and four subordinate agents--Perceptor, Operator,\nAction Reflector, and Notetaker--which handle fine-grained visual perception,\nimmediate action execution, error verification, and information aggregation,\nrespectively. Mobile-Agent-E also features a novel self-evolution module which\nmaintains a persistent long-term memory comprising Tips and Shortcuts. Tips are\ngeneral guidance and lessons learned from prior tasks on how to effectively\ninteract with the environment. Shortcuts are reusable, executable sequences of\natomic operations tailored for specific subroutines. The inclusion of Tips and\nShortcuts facilitates continuous refinement in performance and efficiency.\nAlongside this framework, we introduce Mobile-Eval-E, a new benchmark featuring\ncomplex mobile tasks requiring long-horizon, multi-app interactions. Empirical\nresults show that Mobile-Agent-E achieves a 22% absolute improvement over\nprevious state-of-the-art approaches across three foundation model backbones.\nProject page: https://x-plug.github.io/MobileAgent.\n","authors":["Zhenhailong Wang","Haiyang Xu","Junyang Wang","Xi Zhang","Ming Yan","Ji Zhang","Fei Huang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2501.11733v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.17062v1","updated":"2025-01-28T16:40:40Z","published":"2025-01-28T16:40:40Z","title":"EdgeMLOps: Operationalizing ML models with Cumulocity IoT and\n thin-edge.io for Visual quality Inspection","summary":" This paper introduces EdgeMLOps, a framework leveraging Cumulocity IoT and\nthin-edge.io for deploying and managing machine learning models on\nresource-constrained edge devices. We address the challenges of model\noptimization, deployment, and lifecycle management in edge environments. The\nframework's efficacy is demonstrated through a visual quality inspection (VQI)\nuse case where images of assets are processed on edge devices, enabling\nreal-time condition updates within an asset management system. 
Furthermore, we\nevaluate the performance benefits of different quantization methods,\nspecifically static and dynamic signed-int8, on a Raspberry Pi 4, demonstrating\nsignificant inference time reductions compared to FP32 precision. Our results\nhighlight the potential of EdgeMLOps to enable efficient and scalable AI\ndeployments at the edge for industrial applications.\n","authors":["Kanishk Chaturvedi","Johannes Gasthuber","Mohamed Abdelaal"],"pdf_url":"https://arxiv.org/pdf/2501.17062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.17053v1","updated":"2025-01-28T16:25:10Z","published":"2025-01-28T16:25:10Z","title":"Contextual Self-paced Learning for Weakly Supervised Spatio-Temporal\n Video Grounding","summary":" In this work, we focus on Weakly Supervised Spatio-Temporal Video Grounding\n(WSTVG). It is a multimodal task aimed at localizing specific subjects\nspatio-temporally based on textual queries without bounding box supervision.\nMotivated by recent advancements in multi-modal foundation models for grounding\ntasks, we first explore the potential of state-of-the-art object detection\nmodels for WSTVG. Despite their robust zero-shot capabilities, our adaptation\nreveals significant limitations, including inconsistent temporal predictions,\ninadequate understanding of complex queries, and challenges in adapting to\ndifficult scenarios. We propose CoSPaL (Contextual Self-Paced Learning), a\nnovel approach which is designed to overcome these limitations. CoSPaL\nintegrates three core components: (1) Tubelet Phrase Grounding (TPG), which\nintroduces spatio-temporal prediction by linking textual queries to tubelets;\n(2) Contextual Referral Grounding (CRG), which improves comprehension of\ncomplex queries by extracting contextual information to refine object\nidentification over time; and (3) Self-Paced Scene Understanding (SPS), a\ntraining paradigm that progressively increases task difficulty, enabling the\nmodel to adapt to complex scenarios by transitioning from coarse to\nfine-grained understanding.\n","authors":["Akash Kumar","Zsolt Kira","Yogesh Singh Rawat"],"pdf_url":"https://arxiv.org/pdf/2501.17053v1.pdf","comment":"ICLR'25 Main Conference. Project Page:\n https://akash2907.github.io/cospal_webpage"},{"id":"http://arxiv.org/abs/2501.17044v1","updated":"2025-01-28T16:09:34Z","published":"2025-01-28T16:09:34Z","title":"Synthesizing 3D Abstractions by Inverting Procedural Buildings with\n Transformers","summary":" We generate abstractions of buildings, reflecting the essential aspects of\ntheir geometry and structure, by learning to invert procedural models. We first\nbuild a dataset of abstract procedural building models paired with simulated\npoint clouds and then learn the inverse mapping through a transformer. Given a\npoint cloud, the trained transformer then infers the corresponding abstracted\nbuilding in terms of a programmatic language description. This approach\nleverages expressive procedural models developed for gaming and animation, and\nthereby retains desirable properties such as efficient rendering of the\ninferred abstractions and strong priors for regularity and symmetry. 
Our\napproach achieves good reconstruction accuracy in terms of geometry and\nstructure, as well as structurally consistent inpainting.\n","authors":["Max Dax","Jordi Berbel","Jan Stria","Leonidas Guibas","Urs Bergmann"],"pdf_url":"https://arxiv.org/pdf/2501.17044v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.07688v2","updated":"2025-01-28T16:02:30Z","published":"2024-10-10T07:54:17Z","title":"PokeFlex: A Real-World Dataset of Volumetric Deformable Objects for\n Robotics","summary":" Data-driven methods have shown great potential in solving challenging\nmanipulation tasks; however, their application in the domain of deformable\nobjects has been constrained, in part, by the lack of data. To address this\nlack, we propose PokeFlex, a dataset featuring real-world multimodal data that\nis paired and annotated. The modalities include 3D textured meshes, point\nclouds, RGB images, and depth maps. Such data can be leveraged for several\ndownstream tasks, such as online 3D mesh reconstruction, and it can potentially\nenable underexplored applications such as the real-world deployment of\ntraditional control methods based on mesh simulations. To deal with the\nchallenges posed by real-world 3D mesh reconstruction, we leverage a\nprofessional volumetric capture system that allows complete 360{\\deg}\nreconstruction. PokeFlex consists of 18 deformable objects with varying\nstiffness and shapes. Deformations are generated by dropping objects onto a\nflat surface or by poking the objects with a robot arm. Interaction wrenches\nand contact locations are also reported for the latter case. Using different\ndata modalities, we demonstrated a use case for our dataset training models\nthat, given the novelty of the multimodal nature of Pokeflex, constitute the\nstate-of-the-art in multi-object online template-based mesh reconstruction from\nmultimodal data, to the best of our knowledge. We refer the reader to our\nwebsite ( https://pokeflex-dataset.github.io/ ) for further demos and examples.\n","authors":["Jan Obrist","Miguel Zamora","Hehui Zheng","Ronan Hinchet","Firat Ozdemir","Juan Zarate","Robert K. Katzschmann","Stelian Coros"],"pdf_url":"https://arxiv.org/pdf/2410.07688v2.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2501.15371v2","updated":"2025-01-28T15:56:14Z","published":"2025-01-26T02:52:46Z","title":"Acquiring Submillimeter-Accurate Multi-Task Vision Datasets for\n Computer-Assisted Orthopedic Surgery","summary":" Advances in computer vision, particularly in optical image-based 3D\nreconstruction and feature matching, enable applications like marker-less\nsurgical navigation and digitization of surgery. However, their development is\nhindered by a lack of suitable datasets with 3D ground truth. This work\nexplores an approach to generating realistic and accurate ex vivo datasets\ntailored for 3D reconstruction and feature matching in open orthopedic surgery.\nA set of posed images and an accurately registered ground truth surface mesh of\nthe scene are required to develop vision-based 3D reconstruction and matching\nmethods suitable for surgery. We propose a framework consisting of three core\nsteps and compare different methods for each step: 3D scanning, calibration of\nviewpoints for a set of high-resolution RGB images, and an optical-based method\nfor scene registration. 
We evaluate each step of this framework on an ex vivo\nscoliosis surgery using a pig spine, conducted under real operating room\nconditions. A mean 3D Euclidean error of 0.35 mm is achieved with respect to\nthe 3D ground truth. The proposed method results in submillimeter accurate 3D\nground truths and surgical images with a spatial resolution of 0.1 mm. This\nopens the door to acquiring future surgical datasets for high-precision\napplications.\n","authors":["Emma Most","Jonas Hein","Frédéric Giraud","Nicola A. Cavalcanti","Lukas Zingg","Baptiste Brument","Nino Louman","Fabio Carrillo","Philipp Fürnstahl","Lilian Calvet"],"pdf_url":"https://arxiv.org/pdf/2501.15371v2.pdf","comment":"18 pages, 12 figures. Submitted to the 16th International Conference\n on Information Processing in Computer-Assisted Interventions (IPCAI 2025)"},{"id":"http://arxiv.org/abs/2501.16997v1","updated":"2025-01-28T14:52:10Z","published":"2025-01-28T14:52:10Z","title":"MAUCell: An Adaptive Multi-Attention Framework for Video Frame\n Prediction","summary":" Temporal sequence modeling stands as the fundamental foundation for video\nprediction systems and real-time forecasting operations as well as anomaly\ndetection applications. The achievement of accurate predictions through\nefficient resource consumption remains an ongoing issue in contemporary\ntemporal sequence modeling. We introduce the Multi-Attention Unit (MAUCell)\nwhich combines Generative Adversarial Networks (GANs) and spatio-temporal\nattention mechanisms to improve video frame prediction capabilities. Our\napproach implements three types of attention models to capture intricate motion\nsequences. A dynamic combination of these attention outputs allows the model to\nreach both advanced decision accuracy along with superior quality while\nremaining computationally efficient. The integration of GAN elements makes\ngenerated frames appear more true to life therefore the framework creates\noutput sequences which mimic real-world footage. The new design system\nmaintains equilibrium between temporal continuity and spatial accuracy to\ndeliver reliable video prediction. Through a comprehensive evaluation\nmethodology which merged the perceptual LPIPS measurement together with classic\ntests MSE, MAE, SSIM and PSNR exhibited enhancing capabilities than\ncontemporary approaches based on direct benchmark tests of Moving MNIST, KTH\nAction, and CASIA-B (Preprocessed) datasets. Our examination indicates that\nMAUCell shows promise for operational time requirements. The research findings\ndemonstrate how GANs work best with attention mechanisms to create better\napplications for predicting video sequences.\n","authors":["Shreyam Gupta","P. Agrawal","Priyam Gupta"],"pdf_url":"https://arxiv.org/pdf/2501.16997v1.pdf","comment":"This work has been submitted to the IJCAI 2025 Conference for review.\n It contains: 11 pages, 4 figures, 7 tables, and 3 Algorithms"},{"id":"http://arxiv.org/abs/2501.16992v1","updated":"2025-01-28T14:46:38Z","published":"2025-01-28T14:46:38Z","title":"FedEFM: Federated Endovascular Foundation Model with Unseen Data","summary":" In endovascular surgery, the precise identification of catheters and\nguidewires in X-ray images is essential for reducing intervention risks.\nHowever, accurately segmenting catheter and guidewire structures is challenging\ndue to the limited availability of labeled data. 
Foundation models offer a\npromising solution by enabling the collection of similar domain data to train\nmodels whose weights can be fine-tuned for downstream tasks. Nonetheless,\nlarge-scale data collection for training is constrained by the necessity of\nmaintaining patient privacy. This paper proposes a new method to train a\nfoundation model in a decentralized federated learning setting for endovascular\nintervention. To ensure the feasibility of the training, we tackle the unseen\ndata issue using differentiable Earth Mover's Distance within a knowledge\ndistillation framework. Once trained, our foundation model's weights provide\nvaluable initialization for downstream tasks, thereby enhancing task-specific\nperformance. Intensive experiments show that our approach achieves new\nstate-of-the-art results, contributing to advancements in endovascular\nintervention and robotic-assisted endovascular surgery, while addressing the\ncritical issue of data sharing in the medical domain.\n","authors":["Tuong Do","Nghia Vu","Tudor Jianu","Baoru Huang","Minh Vu","Jionglong Su","Erman Tjiputra","Quang D. Tran","Te-Chuan Chiu","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.16992v1.pdf","comment":"8 pages. Accepted to ICRA 2025"},{"id":"http://arxiv.org/abs/2412.15050v3","updated":"2025-01-28T14:33:42Z","published":"2024-12-19T16:57:45Z","title":"Uni-Renderer: Unifying Rendering and Inverse Rendering Via Dual Stream\n Diffusion","summary":" Rendering and inverse rendering are pivotal tasks in both computer vision and\ngraphics. The rendering equation is the core of the two tasks, as an ideal\nconditional distribution transfer function from intrinsic properties to RGB\nimages. Despite achieving promising results of existing rendering methods, they\nmerely approximate the ideal estimation for a specific scene and come with a\nhigh computational cost. Additionally, the inverse conditional distribution\ntransfer is intractable due to the inherent ambiguity. To address these\nchallenges, we propose a data-driven method that jointly models rendering and\ninverse rendering as two conditional generation tasks within a single diffusion\nframework. Inspired by UniDiffuser, we utilize two distinct time schedules to\nmodel both tasks, and with a tailored dual streaming module, we achieve\ncross-conditioning of two pre-trained diffusion models. This unified approach,\nnamed Uni-Renderer, allows the two processes to facilitate each other through a\ncycle-consistent constrain, mitigating ambiguity by enforcing consistency\nbetween intrinsic properties and rendered images. Combined with a meticulously\nprepared dataset, our method effectively decomposition of intrinsic properties\nand demonstrates a strong capability to recognize changes during rendering. We\nwill open-source our training and inference code to the public, fostering\nfurther research and development in this area.\n","authors":["Zhifei Chen","Tianshuo Xu","Wenhang Ge","Leyi Wu","Dongyu Yan","Jing He","Luozhou Wang","Lu Zeng","Shunsi Zhang","Yingcong Chen","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2412.15050v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16981v1","updated":"2025-01-28T14:28:55Z","published":"2025-01-28T14:28:55Z","title":"Modulating CNN Features with Pre-Trained ViT Representations for\n Open-Vocabulary Object Detection","summary":" Owing to large-scale image-text contrastive training, pre-trained vision\nlanguage model (VLM) like CLIP shows superior open-vocabulary recognition\nability. 
Most existing open-vocabulary object detectors attempt to utilize the\npre-trained VLM to attain generative representation. F-ViT uses the pre-trained\nvisual encoder as the backbone network and freezes it during training. However,\nthe frozen backbone doesn't benefit from the labeled data to strengthen the\nrepresentation. Therefore, we propose a novel two-branch backbone network\ndesign, named as ViT-Feature-Modulated Multi-Scale Convolutional network\n(VMCNet). VMCNet consists of a trainable convolutional branch, a frozen\npre-trained ViT branch and a feature modulation module. The trainable CNN\nbranch could be optimized with labeled data while the frozen pre-trained ViT\nbranch could keep the representation ability derived from large-scale\npre-training. Then, the proposed feature modulation module could modulate the\nmulti-scale CNN features with the representations from ViT branch. With the\nproposed mixed structure, detector is more likely to discover novel categories.\nEvaluated on two popular benchmarks, our method boosts the detection\nperformance on novel category and outperforms the baseline. On OV-COCO, the\nproposed method achieves 44.3 AP$_{50}^{\\mathrm{novel}}$ with ViT-B/16 and 48.5\nAP$_{50}^{\\mathrm{novel}}$ with ViT-L/14. On OV-LVIS, VMCNet with ViT-B/16 and\nViT-L/14 reaches 27.8 and 38.4 mAP$_{r}$.\n","authors":["Xiangyu Gao","Yu Dai","Benliu Qiu","Hongliang Li"],"pdf_url":"https://arxiv.org/pdf/2501.16981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16971v1","updated":"2025-01-28T14:13:17Z","published":"2025-01-28T14:13:17Z","title":"RODEO: Robust Outlier Detection via Exposing Adaptive\n Out-of-Distribution Samples","summary":" In recent years, there have been significant improvements in various forms of\nimage outlier detection. However, outlier detection performance under\nadversarial settings lags far behind that in standard settings. This is due to\nthe lack of effective exposure to adversarial scenarios during training,\nespecially on unseen outliers, leading to detection models failing to learn\nrobust features. To bridge this gap, we introduce RODEO, a data-centric\napproach that generates effective outliers for robust outlier detection. More\nspecifically, we show that incorporating outlier exposure (OE) and adversarial\ntraining can be an effective strategy for this purpose, as long as the exposed\ntraining outliers meet certain characteristics, including diversity, and both\nconceptual differentiability and analogy to the inlier samples. We leverage a\ntext-to-image model to achieve this goal. We demonstrate both quantitatively\nand qualitatively that our adaptive OE method effectively generates ``diverse''\nand ``near-distribution'' outliers, leveraging information from both text and\nimage domains. Moreover, our experimental results show that utilizing our\nsynthesized outliers significantly enhances the performance of the outlier\ndetector, particularly in adversarial settings.\n","authors":["Hossein Mirzaei","Mohammad Jafari","Hamid Reza Dehbashi","Ali Ansari","Sepehr Ghobadi","Masoud Hadi","Arshia Soltani Moakhar","Mohammad Azizmalayeri","Mahdieh Soleymani Baghshah","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2501.16971v1.pdf","comment":"Accepted at the Forty-First International Conference on Machine\n Learning (ICML) 2024. 
The implementation of our work is available at:\n \\url{https://github.com/rohban-lab/RODEO}"},{"id":"http://arxiv.org/abs/2411.15390v3","updated":"2025-01-28T14:12:39Z","published":"2024-11-23T00:09:42Z","title":"The Hatching-Box: A Novel System for Automated Monitoring and\n Quantification of Drosophila melanogaster Developmental Behavior","summary":" In this paper we propose the Hatching-Box, a novel imaging and analysis\nsystem to automatically monitor and quantify the developmental behavior of\nDrosophila in standard rearing vials and during regular rearing routines,\nrendering explicit experiments obsolete. This is achieved by combining custom\ntailored imaging hardware with dedicated detection and tracking algorithms,\nenabling the quantification of larvae, filled/empty pupae and flies over\nmultiple days. Given the affordable and reproducible design of the Hatching-Box\nin combination with our generic client/server-based software, the system can\neasily be scaled to monitor an arbitrary amount of rearing vials\nsimultaneously. We evaluated our system on a curated image dataset comprising\nnearly 470,000 annotated objects and performed several studies on real world\nexperiments. We successfully reproduced results from well-established circadian\nexperiments by comparing the eclosion periods of wild type flies to the clock\nmutants $\\textit{per}^{short}$, $\\textit{per}^{long}$ and $\\textit{per}^0$\nwithout involvement of any manual labor. Furthermore we show, that the\nHatching-Box is able to extract additional information about group behavior as\nwell as to reconstruct the whole life-cycle of the individual specimens. These\nresults not only demonstrate the applicability of our system for long-term\nexperiments but also indicate its benefits for automated monitoring in the\ngeneral cultivation process.\n","authors":["Julian Bigge","Maite Ogueta","Luis Garcia","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2411.15390v3.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.16969v1","updated":"2025-01-28T14:12:32Z","published":"2025-01-28T14:12:32Z","title":"What Really Matters for Learning-based LiDAR-Camera Calibration","summary":" Calibration is an essential prerequisite for the accurate data fusion of\nLiDAR and camera sensors. Traditional calibration techniques often require\nspecific targets or suitable scenes to obtain reliable 2D-3D correspondences.\nTo tackle the challenge of target-less and online calibration, deep neural\nnetworks have been introduced to solve the problem in a data-driven manner.\nWhile previous learning-based methods have achieved impressive performance on\nspecific datasets, they still struggle in complex real-world scenarios. Most\nexisting works focus on improving calibration accuracy but overlook the\nunderlying mechanisms. In this paper, we revisit the development of\nlearning-based LiDAR-Camera calibration and encourage the community to pay more\nattention to the underlying principles to advance practical applications. We\nsystematically analyze the paradigm of mainstream learning-based methods, and\nidentify the critical limitations of regression-based methods with the widely\nused data generation pipeline. Our findings reveal that most learning-based\nmethods inadvertently operate as retrieval networks, focusing more on\nsingle-modality distributions rather than cross-modality correspondences. 
We\nalso investigate how the input data format and preprocessing operations impact\nnetwork performance and summarize the regression clues to inform further\nimprovements.\n","authors":["Shujuan Huang","Chunyu Lin","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.16969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16947v1","updated":"2025-01-28T13:46:01Z","published":"2025-01-28T13:46:01Z","title":"Image-based Geo-localization for Robotics: Are Black-box Vision-Language\n Models there yet?","summary":" The advances in Vision-Language models (VLMs) offer exciting opportunities\nfor robotic applications involving image geo-localization, the problem of\nidentifying the geo-coordinates of a place based on visual data only. Recent\nresearch works have focused on using a VLM as embeddings extractor for\ngeo-localization, however, the most sophisticated VLMs may only be available as\nblack boxes that are accessible through an API, and come with a number of\nlimitations: there is no access to training data, model features and gradients;\nretraining is not possible; the number of predictions may be limited by the\nAPI; training on model outputs is often prohibited; and queries are open-ended.\nThe utilization of a VLM as a stand-alone, zero-shot geo-localization system\nusing a single text-based prompt is largely unexplored. To bridge this gap,\nthis paper undertakes the first systematic study, to the best of our knowledge,\nto investigate the potential of some of the state-of-the-art VLMs as\nstand-alone, zero-shot geo-localization systems in a black-box setting with\nrealistic constraints. We consider three main scenarios for this thorough\ninvestigation: a) fixed text-based prompt; b) semantically-equivalent\ntext-based prompts; and c) semantically-equivalent query images. We also take\ninto account the auto-regressive and probabilistic generation process of the\nVLMs when investigating their utility for geo-localization task by using model\nconsistency as a metric in addition to traditional accuracy. Our work provides\nnew insights in the capabilities of different VLMs for the above-mentioned\nscenarios.\n","authors":["Sania Waheed","Bruno Ferrarini","Michael Milford","Sarvapali D. Ramchurn","Shoaib Ehsan"],"pdf_url":"https://arxiv.org/pdf/2501.16947v1.pdf","comment":"Submitted to IROS 2025"},{"id":"http://arxiv.org/abs/2501.16917v1","updated":"2025-01-28T13:01:41Z","published":"2025-01-28T13:01:41Z","title":"B-FPGM: Lightweight Face Detection via Bayesian-Optimized Soft FPGM\n Pruning","summary":" Face detection is a computer vision application that increasingly demands\nlightweight models to facilitate deployment on devices with limited\ncomputational resources. Neural network pruning is a promising technique that\ncan effectively reduce network size without significantly affecting\nperformance. In this work, we propose a novel face detection pruning pipeline\nthat leverages Filter Pruning via Geometric Median (FPGM) pruning, Soft Filter\nPruning (SFP) and Bayesian optimization in order to achieve a superior\ntrade-off between size and performance compared to existing approaches. FPGM\npruning is a structured pruning technique that allows pruning the least\nsignificant filters in each layer, while SFP iteratively prunes the filters and\nallows them to be updated in any subsequent training step. Bayesian\noptimization is employed in order to optimize the pruning rates of each layer,\nrather than relying on engineering expertise to determine the optimal pruning\nrates for each layer. 
In our experiments across all three subsets of the WIDER\nFACE dataset, our proposed approach B-FPGM consistently outperforms existing\nones in balancing model size and performance. All our experiments were applied\nto EResFD, the currently smallest (in number of parameters) well-performing\nface detector of the literature; a small ablation study with a second small\nface detector, EXTD, is also reported. The source code and trained pruned face\ndetection models can be found at: https://github.com/IDTITI/B-FPGM.\n","authors":["Nikolaos Kaparinos","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2501.16917v1.pdf","comment":"Accepted for publication, RWS Workshop @ IEEE/CVF Winter Conference\n on Applications of Computer Vision (WACV 2025), Tucson, AZ, USA, Feb. 2025.\n This is the authors' \"accepted version\""},{"id":"http://arxiv.org/abs/2308.14409v3","updated":"2025-01-28T12:59:17Z","published":"2023-08-28T08:47:06Z","title":"Steerable Conditional Diffusion for Out-of-Distribution Adaptation in\n Medical Image Reconstruction","summary":" Denoising diffusion models have emerged as the go-to generative framework for\nsolving inverse problems in imaging. A critical concern regarding these models\nis their performance on out-of-distribution tasks, which remains an\nunder-explored challenge. Using a diffusion model on an out-of-distribution\ndataset, realistic reconstructions can be generated, but with hallucinating\nimage features that are uniquely present in the training dataset. To address\nthis discrepancy during train-test time and improve reconstruction accuracy, we\nintroduce a novel sampling framework called Steerable Conditional Diffusion.\nSpecifically, this framework adapts the diffusion model, concurrently with\nimage reconstruction, based solely on the information provided by the available\nmeasurement. Utilising our proposed method, we achieve substantial enhancements\nin out-of-distribution performance across diverse imaging modalities, advancing\nthe robust deployment of denoising diffusion models in real-world applications.\n","authors":["Riccardo Barbano","Alexander Denker","Hyungjin Chung","Tae Hoon Roh","Simon Arridge","Peter Maass","Bangti Jin","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2308.14409v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16312v2","updated":"2025-01-28T12:52:41Z","published":"2025-01-27T18:49:38Z","title":"LinPrim: Linear Primitives for Differentiable Volumetric Rendering","summary":" Volumetric rendering has become central to modern novel view synthesis\nmethods, which use differentiable rendering to optimize 3D scene\nrepresentations directly from observed views. While many recent works build on\nNeRF or 3D Gaussians, we explore an alternative volumetric scene\nrepresentation. More specifically, we introduce two new scene representations\nbased on linear primitives-octahedra and tetrahedra-both of which define\nhomogeneous volumes bounded by triangular faces. This formulation aligns\nnaturally with standard mesh-based tools, minimizing overhead for downstream\napplications. To optimize these primitives, we present a differentiable\nrasterizer that runs efficiently on GPUs, allowing end-to-end gradient-based\noptimization while maintaining realtime rendering capabilities. Through\nexperiments on real-world datasets, we demonstrate comparable performance to\nstate-of-the-art volumetric methods while requiring fewer primitives to achieve\nsimilar reconstruction fidelity. 
Our findings provide insights into the\ngeometry of volumetric rendering and suggest that adopting explicit polyhedra\ncan expand the design space of scene representations.\n","authors":["Nicolas von Lützow","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2501.16312v2.pdf","comment":"Project page: https://nicolasvonluetzow.github.io/LinPrim ; Project\n video: https://youtu.be/P2yeHwmGaeM"},{"id":"http://arxiv.org/abs/2501.16904v1","updated":"2025-01-28T12:44:27Z","published":"2025-01-28T12:44:27Z","title":"Adversarial Masked Autoencoder Purifier with Defense Transferability","summary":" The study of adversarial defense still struggles to combat with advanced\nadversarial attacks. In contrast to most prior studies that rely on the\ndiffusion model for test-time defense to remarkably increase the inference\ntime, we propose Masked AutoEncoder Purifier (MAEP), which integrates Masked\nAutoEncoder (MAE) into an adversarial purifier framework for test-time\npurification. While MAEP achieves promising adversarial robustness, it\nparticularly features model defense transferability and attack generalization\nwithout relying on using additional data that is different from the training\ndataset. To our knowledge, MAEP is the first study of adversarial purifier\nbased on MAE. Extensive experimental results demonstrate that our method can\nnot only maintain clear accuracy with only a slight drop but also exhibit a\nclose gap between the clean and robust accuracy. Notably, MAEP trained on\nCIFAR10 achieves state-of-the-art performance even when tested directly on\nImageNet, outperforming existing diffusion-based models trained specifically on\nImageNet.\n","authors":["Yuan-Chih Chen","Chun-Shien Lu"],"pdf_url":"https://arxiv.org/pdf/2501.16904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16896v1","updated":"2025-01-28T12:27:25Z","published":"2025-01-28T12:27:25Z","title":"Frequency Matters: Explaining Biases of Face Recognition in the\n Frequency Domain","summary":" Face recognition (FR) models are vulnerable to performance variations across\ndemographic groups. The causes for these performance differences are unclear\ndue to the highly complex deep learning-based structure of face recognition\nmodels. Several works aimed at exploring possible roots of gender and ethnicity\nbias, identifying semantic reasons such as hairstyle, make-up, or facial hair\nas possible sources. Motivated by recent discoveries of the importance of\nfrequency patterns in convolutional neural networks, we explain bias in face\nrecognition using state-of-the-art frequency-based explanations. Our extensive\nresults show that different frequencies are important to FR models depending on\nthe ethnicity of the samples.\n","authors":["Marco Huber","Fadi Boutros","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2501.16896v1.pdf","comment":"Accepted at xAI4Biometrics at ECCV 2024"},{"id":"http://arxiv.org/abs/2501.16889v1","updated":"2025-01-28T12:19:44Z","published":"2025-01-28T12:19:44Z","title":"Extending Information Bottleneck Attribution to Video Sequences","summary":" We introduce VIBA, a novel approach for explainable video classification by\nadapting Information Bottlenecks for Attribution (IBA) to video sequences.\nWhile most traditional explainability methods are designed for image models,\nour IBA framework addresses the need for explainability in temporal models used\nfor video analysis. 
To demonstrate its effectiveness, we apply VIBA to video\ndeepfake detection, testing it on two architectures: the Xception model for\nspatial features and a VGG11-based model for capturing motion dynamics through\noptical flow. Using a custom dataset that reflects recent deepfake generation\ntechniques, we adapt IBA to create relevance and optical flow maps, visually\nhighlighting manipulated regions and motion inconsistencies. Our results show\nthat VIBA generates temporally and spatially consistent explanations, which\nalign closely with human annotations, thus providing interpretability for video\nclassification and particularly for deepfake detection.\n","authors":["Veronika Solopova","Lucas Schmidt","Dorothea Kolossa"],"pdf_url":"https://arxiv.org/pdf/2501.16889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16879v1","updated":"2025-01-28T12:06:29Z","published":"2025-01-28T12:06:29Z","title":"Ultra-high resolution multimodal MRI dense labelled holistic brain atlas","summary":" In this paper, we introduce holiAtlas, a holistic, multimodal and\nhigh-resolution human brain atlas. This atlas covers different levels of\ndetails of the human brain anatomy, from the organ to the substructure level,\nusing a new dense labelled protocol generated from the fusion of multiple local\nprotocols at different scales. This atlas has been constructed averaging images\nand segmentations of 75 healthy subjects from the Human Connectome Project\ndatabase. Specifically, MR images of T1, T2 and WMn (White Matter nulled)\ncontrasts at 0.125 $mm^{3}$ resolution that were nonlinearly registered and\naveraged using symmetric group-wise normalisation to construct the atlas. At\nthe finest level, the holiAtlas protocol has 350 different labels derived from\n10 different delineation protocols. These labels were grouped at different\nscales to provide a holistic view of the brain at different levels in a\ncoherent and consistent manner. This multiscale and multimodal atlas can be\nused for the development of new ultra-high resolution segmentation methods that\ncan potentially leverage the early detection of neurological disorders.\n","authors":["José V. Manjón","Sergio Morell-Ortega","Marina Ruiz-Perez","Boris Mansencal","Edern Le Bot","Marien Gadea","Enrique Lanuza","Gwenaelle Catheline","Thomas Tourdias","Vincent Planche","Rémi Giraud","Denis Rivière","Jean-François Mangin","Nicole Labra-Avila","Roberto Vivo-Hernando","Gregorio Rubio","Fernando Aparici","Maria de la Iglesia-Vaya","Pierrick Coupé"],"pdf_url":"https://arxiv.org/pdf/2501.16879v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2501.16870v1","updated":"2025-01-28T11:42:15Z","published":"2025-01-28T11:42:15Z","title":"Experimenting with Affective Computing Models in Video Interviews with\n Spanish-speaking Older Adults","summary":" Understanding emotional signals in older adults is crucial for designing\nvirtual assistants that support their well-being. However, existing affective\ncomputing models often face significant limitations: (1) limited availability\nof datasets representing older adults, especially in non-English-speaking\npopulations, and (2) poor generalization of models trained on younger or\nhomogeneous demographics. To address these gaps, this study evaluates\nstate-of-the-art affective computing models -- including facial expression\nrecognition, text sentiment analysis, and smile detection -- using videos of\nolder adults interacting with either a person or a virtual avatar. 
As part of\nthis effort, we introduce a novel dataset featuring Spanish-speaking older\nadults engaged in human-to-human video interviews. Through three comprehensive\nanalyses, we investigate (1) the alignment between human-annotated labels and\nautomatic model outputs, (2) the relationships between model outputs across\ndifferent modalities, and (3) individual variations in emotional signals. Using\nboth the Wizard of Oz (WoZ) dataset and our newly collected dataset, we uncover\nlimited agreement between human annotations and model predictions, weak\nconsistency across modalities, and significant variability among individuals.\nThese findings highlight the shortcomings of generalized emotion perception\nmodels and emphasize the need of incorporating personal variability and\ncultural nuances into future systems.\n","authors":["Josep Lopez Camunas","Cristina Bustos","Yanjun Zhu","Raquel Ros","Agata Lapedriza"],"pdf_url":"https://arxiv.org/pdf/2501.16870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09293v3","updated":"2025-01-28T11:15:45Z","published":"2024-06-13T16:29:46Z","title":"StableMaterials: Enhancing Diversity in Material Generation via\n Semi-Supervised Learning","summary":" We introduce StableMaterials, a novel approach for generating photorealistic\nphysical-based rendering (PBR) materials that integrate semi-supervised\nlearning with Latent Diffusion Models (LDMs). Our method employs adversarial\ntraining to distill knowledge from existing large-scale image generation\nmodels, minimizing the reliance on annotated data and enhancing the diversity\nin generation. This distillation approach aligns the distribution of the\ngenerated materials with that of image textures from an SDXL model, enabling\nthe generation of novel materials that are not present in the initial training\ndataset. Furthermore, we employ a diffusion-based refiner model to improve the\nvisual quality of the samples and achieve high-resolution generation. Finally,\nwe distill a latent consistency model for fast generation in just four steps\nand propose a new tileability technique that removes visual artifacts typically\nassociated with fewer diffusion steps. We detail the architecture and training\nprocess of StableMaterials, the integration of semi-supervised training within\nexisting LDM frameworks and show the advantages of our approach. Comparative\nevaluations with state-of-the-art methods show the effectiveness of\nStableMaterials, highlighting its potential applications in computer graphics\nand beyond. StableMaterials is publicly available at\nhttps://gvecchio.com/stablematerials.\n","authors":["Giuseppe Vecchio"],"pdf_url":"https://arxiv.org/pdf/2406.09293v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13106v3","updated":"2025-01-28T11:05:18Z","published":"2025-01-22T18:59:46Z","title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video\n Understanding","summary":" In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation\nmodel for image and video understanding. The core design philosophy of\nVideoLLaMA3 is vision-centric. The meaning of \"vision-centric\" is two-fold: the\nvision-centric training paradigm and vision-centric framework design. The key\ninsight of our vision-centric training paradigm is that high-quality image-text\ndata is crucial for both image and video understanding. Instead of preparing\nmassive video-text datasets, we focus on constructing large-scale and\nhigh-quality image-text datasets. 
VideoLLaMA3 has four training stages: 1)\nVision Encoder Adaptation, which enables vision encoder to accept images of\nvariable resolutions as input; 2) Vision-Language Alignment, which jointly\ntunes the vision encoder, projector, and LLM with large-scale image-text data\ncovering multiple types (including scene images, documents, charts) as well as\ntext-only data. 3) Multi-task Fine-tuning, which incorporates image-text SFT\ndata for downstream tasks and video-text data to establish a foundation for\nvideo understanding. 4) Video-centric Fine-tuning, which further improves the\nmodel's capability in video understanding. As for the framework design, to\nbetter capture fine-grained details in images, the pretrained vision encoder is\nadapted to encode images of varying sizes into vision tokens with corresponding\nnumbers, rather than a fixed number of tokens. For video inputs, we reduce the\nnumber of vision tokens according to their similarity so that the\nrepresentation of videos will be more precise and compact. Benefit from\nvision-centric designs, VideoLLaMA3 achieves compelling performances in both\nimage and video understanding benchmarks.\n","authors":["Boqiang Zhang","Kehan Li","Zesen Cheng","Zhiqiang Hu","Yuqian Yuan","Guanzheng Chen","Sicong Leng","Yuming Jiang","Hang Zhang","Xin Li","Peng Jin","Wenqi Zhang","Fan Wang","Lidong Bing","Deli Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.13106v3.pdf","comment":"BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to\n this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3"},{"id":"http://arxiv.org/abs/2407.15719v2","updated":"2025-01-28T10:19:49Z","published":"2024-07-22T15:22:33Z","title":"GFE-Mamba: Mamba-based AD Multi-modal Progression Assessment via\n Generative Feature Extraction from MCI","summary":" Alzheimer's Disease (AD) is a progressive, irreversible neurodegenerative\ndisorder that often originates from Mild Cognitive Impairment (MCI). This\nprogression results in significant memory loss and severely affects patients'\nquality of life. Clinical trials have consistently shown that early and\ntargeted interventions for individuals with MCI may slow or even prevent the\nadvancement of AD. Research indicates that accurate medical classification\nrequires diverse multimodal data, including detailed assessment scales and\nneuroimaging techniques like Magnetic Resonance Imaging (MRI) and Positron\nEmission Tomography (PET). However, simultaneously collecting the\naforementioned three modalities for training presents substantial challenges.\nTo tackle these difficulties, we propose GFE-Mamba, a multimodal classifier\nfounded on Generative Feature Extractor. The intermediate features provided by\nthis Extractor can compensate for the shortcomings of PET and achieve profound\nmultimodal fusion in the classifier. The Mamba block, as the backbone of the\nclassifier, enables it to efficiently extract information from long-sequence\nscale information. Pixel-level Bi-cross Attention supplements pixel-level\ninformation from MRI and PET. We provide our rationale for developing this\ncross-temporal progression prediction dataset and the pre-trained Extractor\nweights. Our experimental findings reveal that the GFE-Mamba model effectively\npredicts the progression from MCI to AD and surpasses several leading methods\nin the field. 
Our source code is available at\nhttps://github.com/Tinysqua/GFE-Mamba.\n","authors":["Zhaojie Fang","Shenghao Zhu","Yifei Chen","Binfeng Zou","Fan Jia","Linwei Qiu","Chang Liu","Xiang Feng","Changmiao Wang","Feiwei Qin","Jin Fan","Changbiao Chu"],"pdf_url":"https://arxiv.org/pdf/2407.15719v2.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.15187v2","updated":"2025-01-28T09:44:28Z","published":"2025-01-25T11:51:23Z","title":"Uni-Sign: Toward Unified Sign Language Understanding at Scale","summary":" Sign language pre-training has gained increasing attention for its ability to\nenhance performance across various sign language understanding (SLU) tasks.\nHowever, existing methods often suffer from a gap between pre-training and\nfine-tuning, leading to suboptimal results. To address this, we propose\nUni-Sign, a unified pre-training framework that eliminates the gap between\npre-training and downstream SLU tasks through a large-scale generative\npre-training strategy and a novel fine-tuning paradigm. First, we introduce\nCSL-News, a large-scale Chinese Sign Language (CSL) dataset containing 1,985\nhours of video paired with textual annotations, which enables effective\nlarge-scale pre-training. Second, Uni-Sign unifies SLU tasks by treating\ndownstream tasks as a single sign language translation (SLT) task during\nfine-tuning, ensuring seamless knowledge transfer between pre-training and\nfine-tuning. Furthermore, we incorporate a prior-guided fusion (PGF) module and\na score-aware sampling strategy to efficiently fuse pose and RGB information,\naddressing keypoint inaccuracies and improving computational efficiency.\nExtensive experiments across multiple SLU benchmarks demonstrate that Uni-Sign\nachieves state-of-the-art performance across multiple downstream SLU tasks.\nDataset and code are available at github.com/ZechengLi19/Uni-Sign.\n","authors":["Zecheng Li","Wengang Zhou","Weichao Zhao","Kepeng Wu","Hezhen Hu","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2501.15187v2.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2404.07594v2","updated":"2025-01-28T09:32:08Z","published":"2024-04-11T09:23:44Z","title":"Weakly-Supervised Learning via Multi-Lateral Decoder Branching for Tool\n Segmentation in Robot-Assisted Cardiovascular Catheterization","summary":" Robot-assisted catheterization has garnered a good attention for its\npotentials in treating cardiovascular diseases. However, advancing\nsurgeon-robot collaboration still requires further research, particularly on\ntask-specific automation. For instance, automated tool segmentation can assist\nsurgeons in visualizing and tracking of endovascular tools during cardiac\nprocedures. While learning-based models have demonstrated state-of-the-art\nsegmentation performances, generating ground-truth labels for fully-supervised\nmethods is both labor-intensive time consuming, and costly. In this study, we\npropose a weakly-supervised learning method with multi-lateral pseudo labeling\nfor tool segmentation in cardiovascular angiogram datasets. The method utilizes\na modified U-Net architecture featuring one encoder and multiple laterally\nbranched decoders. The decoders generate diverse pseudo labels under different\nperturbations, augmenting available partial labels. The pseudo labels are\nself-generated using a mixed loss function with shared consistency across the\ndecoders. 
The weakly-supervised model was trained end-to-end and validated\nusing partially annotated angiogram data from three cardiovascular\ncatheterization procedures. Validation results show that the model could\nperform closer to fully-supervised models. Also, the proposed weakly-supervised\nmulti-lateral method outperforms three well known methods used for\nweakly-supervised learning, offering the highest segmentation performance\nacross the three angiogram datasets. Furthermore, numerous ablation studies\nconfirmed the model's consistent performance under different parameters.\nFinally, the model was applied for tool segmentation in a robot-assisted\ncatheterization experiments. The model enhanced visualization with high\nconnectivity indices for guidewire and catheter, and a mean processing time of\n35 ms per frame.\n","authors":["Olatunji Mumini Omisore","Toluwanimi Akinyemi","Anh Nguyen","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2404.07594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16811v1","updated":"2025-01-28T09:29:13Z","published":"2025-01-28T09:29:13Z","title":"Not Every Patch is Needed: Towards a More Efficient and Effective\n Backbone for Video-based Person Re-identification","summary":" This paper proposes a new effective and efficient plug-and-play backbone for\nvideo-based person re-identification (ReID). Conventional video-based ReID\nmethods typically use CNN or transformer backbones to extract deep features for\nevery position in every sampled video frame. Here, we argue that this\nexhaustive feature extraction could be unnecessary, since we find that\ndifferent frames in a ReID video often exhibit small differences and contain\nmany similar regions due to the relatively slight movements of human beings.\nInspired by this, a more selective, efficient paradigm is explored in this\npaper. Specifically, we introduce a patch selection mechanism to reduce\ncomputational cost by choosing only the crucial and non-repetitive patches for\nfeature extraction. Additionally, we present a novel network structure that\ngenerates and utilizes pseudo frame global context to address the issue of\nincomplete views resulting from sparse inputs. By incorporating these new\ndesigns, our backbone can achieve both high performance and low computational\ncost. Extensive experiments on multiple datasets show that our approach reduces\nthe computational cost by 74\\% compared to ViT-B and 28\\% compared to ResNet50,\nwhile the accuracy is on par with ViT-B and outperforms ResNet50 significantly.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2501.16811v1.pdf","comment":"IEEE TIP"},{"id":"http://arxiv.org/abs/2501.08137v2","updated":"2025-01-28T09:14:14Z","published":"2025-01-14T14:15:10Z","title":"Audio-Visual Deepfake Detection With Local Temporal Inconsistencies","summary":" This paper proposes an audio-visual deepfake detection approach that aims to\ncapture fine-grained temporal inconsistencies between audio and visual\nmodalities. To achieve this, both architectural and data synthesis strategies\nare introduced. From an architectural perspective, a temporal distance map,\ncoupled with an attention mechanism, is designed to capture these\ninconsistencies while minimizing the impact of irrelevant temporal\nsubsequences. Moreover, we explore novel pseudo-fake generation techniques to\nsynthesize local inconsistencies. 
Our approach is evaluated against\nstate-of-the-art methods using the DFDC and FakeAVCeleb datasets, demonstrating\nits effectiveness in detecting audio-visual deepfakes.\n","authors":["Marcella Astrid","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2501.08137v2.pdf","comment":"Accepted in ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.16803v1","updated":"2025-01-28T09:08:31Z","published":"2025-01-28T09:08:31Z","title":"RG-Attn: Radian Glue Attention for Multi-modality Multi-agent\n Cooperative Perception","summary":" Cooperative perception offers an optimal solution to overcome the perception\nlimitations of single-agent systems by leveraging Vehicle-to-Everything (V2X)\ncommunication for data sharing and fusion across multiple agents. However, most\nexisting approaches focus on single-modality data exchange, limiting the\npotential of both homogeneous and heterogeneous fusion across agents. This\noverlooks the opportunity to utilize multi-modality data per agent, restricting\nthe system's performance. In the automotive industry, manufacturers adopt\ndiverse sensor configurations, resulting in heterogeneous combinations of\nsensor modalities across agents. To harness the potential of every possible\ndata source for optimal performance, we design a robust LiDAR and camera\ncross-modality fusion module, Radian-Glue-Attention (RG-Attn), applicable to\nboth intra-agent cross-modality fusion and inter-agent cross-modality fusion\nscenarios, owing to the convenient coordinate conversion by transformation\nmatrix and the unified sampling/inversion mechanism. We also propose two\ndifferent architectures, named Paint-To-Puzzle (PTP) and\nCo-Sketching-Co-Coloring (CoS-CoCo), for conducting cooperative perception. PTP\naims for maximum precision performance and achieves smaller data packet size by\nlimiting cross-agent fusion to a single instance, but requiring all\nparticipants to be equipped with LiDAR. In contrast, CoS-CoCo supports agents\nwith any configuration-LiDAR-only, camera-only, or LiDAR-camera-both,\npresenting more generalization ability. Our approach achieves state-of-the-art\n(SOTA) performance on both real and simulated cooperative perception datasets.\nThe code will be released at GitHub in early 2025.\n","authors":["Lantao Li","Kang Yang","Wenqi Zhang","Xiaoxue Wang","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2501.16803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16787v1","updated":"2025-01-28T08:33:59Z","published":"2025-01-28T08:33:59Z","title":"Dynamic Hypergraph Representation for Bone Metastasis Cancer Analysis","summary":" Bone metastasis analysis is a significant challenge in pathology and plays a\ncritical role in determining patient quality of life and treatment strategies.\nThe microenvironment and specific tissue structures are essential for\npathologists to predict the primary bone cancer origins and primary bone cancer\nsubtyping. By digitizing bone tissue sections into whole slide images (WSIs)\nand leveraging deep learning to model slide embeddings, this analysis can be\nenhanced. However, tumor metastasis involves complex multivariate interactions\nwith diverse bone tissue structures, which traditional WSI analysis methods\nsuch as multiple instance learning (MIL) fail to capture. Moreover, graph\nneural networks (GNNs), limited to modeling pairwise relationships, are hard to\nrepresent high-order biological associations. 
To address these challenges, we\npropose a dynamic hypergraph neural network (DyHG) that overcomes the edge\nconstruction limitations of traditional graph representations by connecting\nmultiple nodes via hyperedges. A low-rank strategy is used to reduce the\ncomplexity of parameters in learning hypergraph structures, while a\nGumbel-Softmax-based sampling strategy optimizes the patch distribution across\nhyperedges. An MIL aggregator is then used to derive a graph-level embedding\nfor comprehensive WSI analysis. To evaluate the effectiveness of DyHG, we\nconstruct two large-scale datasets for primary bone cancer origins and\nsubtyping classification based on real-world bone metastasis scenarios.\nExtensive experiments demonstrate that DyHG significantly outperforms\nstate-of-the-art (SOTA) baselines, showcasing its ability to model complex\nbiological interactions and improve the accuracy of bone metastasis analysis.\n","authors":["Yuxuan Chen","Jiawen Li","Huijuan Shi","Yang Xu","Tian Guan","Lianghui Zhu","Yonghong He","Anjia Han"],"pdf_url":"https://arxiv.org/pdf/2501.16787v1.pdf","comment":"12 pages,11 figures"},{"id":"http://arxiv.org/abs/2501.15878v2","updated":"2025-01-28T08:33:41Z","published":"2025-01-27T09:03:34Z","title":"Slot-Guided Adaptation of Pre-trained Diffusion Models for\n Object-Centric Learning and Compositional Generation","summary":" We present SlotAdapt, an object-centric learning method that combines slot\nattention with pretrained diffusion models by introducing adapters for\nslot-based conditioning. Our method preserves the generative power of\npretrained diffusion models, while avoiding their text-centric conditioning\nbias. We also incorporate an additional guidance loss into our architecture to\nalign cross-attention from adapter layers with slot attention. This enhances\nthe alignment of our model with the objects in the input image without using\nexternal supervision. Experimental results show that our method outperforms\nstate-of-the-art techniques in object discovery and image generation tasks\nacross multiple datasets, including those with real images. Furthermore, we\ndemonstrate through experiments that our method performs remarkably well on\ncomplex real-world images for compositional generation, in contrast to other\nslot-based generative methods in the literature. The project page can be found\nat https://kaanakan.github.io/SlotAdapt/.\n","authors":["Adil Kaan Akan","Yucel Yemez"],"pdf_url":"https://arxiv.org/pdf/2501.15878v2.pdf","comment":"Accepted to ICLR2025. Project page:\n https://kaanakan.github.io/SlotAdapt/"},{"id":"http://arxiv.org/abs/2501.16786v1","updated":"2025-01-28T08:30:58Z","published":"2025-01-28T08:30:58Z","title":"Exploring the Role of Explicit Temporal Modeling in Multimodal Large\n Language Models for Video Understanding","summary":" Applying Multimodal Large Language Models (MLLMs) to video understanding\npresents significant challenges due to the need to model temporal relations\nacross frames. Existing approaches adopt either implicit temporal modeling,\nrelying solely on the LLM decoder, or explicit temporal modeling, employing\nauxiliary temporal encoders. To investigate this debate between the two\nparadigms, we propose the Stackable Temporal Encoder (STE). STE enables\nflexible explicit temporal modeling with adjustable temporal receptive fields\nand token compression ratios. 
Using STE, we systematically compare implicit and\nexplicit temporal modeling across dimensions such as overall performance, token\ncompression effectiveness, and temporal-specific understanding. We also explore\nSTE's design considerations and broader impacts as a plug-in module and in\nimage modalities. Our findings emphasize the critical role of explicit temporal\nmodeling, providing actionable insights to advance video MLLMs.\n","authors":["Yun Li","Zhe Liu","Yajing Kong","Guangrui Li","Jiyuan Zhang","Chao Bian","Feng Liu","Lina Yao","Zhenbang Sun"],"pdf_url":"https://arxiv.org/pdf/2501.16786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.06360v2","updated":"2025-01-28T08:10:15Z","published":"2021-06-11T13:01:03Z","title":"Conterfactual Generative Zero-Shot Semantic Segmentation","summary":" zero-shot learning is an essential part of computer vision. As a classical\ndownstream task, zero-shot semantic segmentation has been studied because of\nits applicant value. One of the popular zero-shot semantic segmentation methods\nis based on the generative model Most new proposed works added structures on\nthe same architecture to enhance this model. However, we found that, from the\nview of causal inference, the result of the original model has been influenced\nby spurious statistical relationships. Thus the performance of the prediction\nshows severe bias. In this work, we consider counterfactual methods to avoid\nthe confounder in the original model. Based on this method, we proposed a new\nframework for zero-shot semantic segmentation. Our model is compared with\nbaseline models on two real-world datasets, Pascal-VOC and Pascal-Context. The\nexperiment results show proposed models can surpass previous confounded models\nand can still make use of additional structures to improve the performance. We\nalso design a simple structure based on Graph Convolutional Networks (GCN) in\nthis work.\n","authors":["Feihong Shen","Jun Liu","Ping Hu"],"pdf_url":"https://arxiv.org/pdf/2106.06360v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.16778v1","updated":"2025-01-28T08:02:21Z","published":"2025-01-28T08:02:21Z","title":"FlexMotion: Lightweight, Physics-Aware, and Controllable Human Motion\n Generation","summary":" Lightweight, controllable, and physically plausible human motion synthesis is\ncrucial for animation, virtual reality, robotics, and human-computer\ninteraction applications. Existing methods often compromise between\ncomputational efficiency, physical realism, or spatial controllability. We\npropose FlexMotion, a novel framework that leverages a computationally\nlightweight diffusion model operating in the latent space, eliminating the need\nfor physics simulators and enabling fast and efficient training. FlexMotion\nemploys a multimodal pre-trained Transformer encoder-decoder, integrating joint\nlocations, contact forces, joint actuations and muscle activations to ensure\nthe physical plausibility of the generated motions. FlexMotion also introduces\na plug-and-play module, which adds spatial controllability over a range of\nmotion parameters (e.g., joint locations, joint actuations, contact forces, and\nmuscle activations). Our framework achieves realistic motion generation with\nimproved efficiency and control, setting a new benchmark for human motion\nsynthesis. 
We evaluate FlexMotion on extended datasets and demonstrate its\nsuperior performance in terms of realism, physical plausibility, and\ncontrollability.\n","authors":["Arvin Tashakori","Arash Tashakori","Gongbo Yang","Z. Jane Wang","Peyman Servati"],"pdf_url":"https://arxiv.org/pdf/2501.16778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16769v1","updated":"2025-01-28T07:49:52Z","published":"2025-01-28T07:49:52Z","title":"Beyond-Labels: Advancing Open-Vocabulary Segmentation With\n Vision-Language Models","summary":" Self-supervised learning can resolve numerous image or linguistic processing\nproblems when effectively trained. This study investigated simple yet efficient\nmethods for adaping previously learned foundation models for open-vocabulary\nsemantic segmentation tasks. Our research proposed \"Beyond-Labels,\" a\nlightweight transformer-based fusion module that uses a handful of image\nsegmentation data to fuse frozen image representations with language concepts.\nFurthermore, we efficiently captured positional information in images using\nFourier embeddings, thus improving the generalization across various image\nsizes. Extensive ablation tests were performed to investigate the important\ncomponents of our proposed method; when tested against the common benchmark\nPASCAL-5i, it demonstrated superior performance despite being trained on frozen\nimage and language characteristics.\n","authors":["Muhammad Atta ur Rahman"],"pdf_url":"https://arxiv.org/pdf/2501.16769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16767v1","updated":"2025-01-28T07:46:13Z","published":"2025-01-28T07:46:13Z","title":"Target-driven Self-Distillation for Partial Observed Trajectories\n Forecasting","summary":" Accurate prediction of future trajectories of traffic agents is essential for\nensuring safe autonomous driving. However, partially observed trajectories can\nsignificantly degrade the performance of even state-of-the-art models. Previous\napproaches often rely on knowledge distillation to transfer features from fully\nobserved trajectories to partially observed ones. This involves firstly\ntraining a fully observed model and then using a distillation process to create\nthe final model. While effective, they require multi-stage training, making the\ntraining process very expensive. Moreover, knowledge distillation can lead to a\nperformance degradation of the model. In this paper, we introduce a\nTarget-driven Self-Distillation method (TSD) for motion forecasting. Our method\nleverages predicted accurate targets to guide the model in making predictions\nunder partial observation conditions. By employing self-distillation, the model\nlearns from the feature distributions of both fully observed and partially\nobserved trajectories during a single end-to-end training process. This\nenhances the model's ability to predict motion accurately in both fully\nobserved and partially observed scenarios. We evaluate our method on multiple\ndatasets and state-of-the-art motion forecasting models. Extensive experimental\nresults demonstrate that our approach achieves significant performance\nimprovements in both settings. 
To facilitate further research, we will release\nour code and model checkpoints.\n","authors":["Pengfei Zhu","Peng Shu","Mengshi Qi","Liang Liu","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.16767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16764v1","updated":"2025-01-28T07:38:59Z","published":"2025-01-28T07:38:59Z","title":"DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian\n Splat Generation","summary":" Recent advancements in 3D content generation from text or a single image\nstruggle with limited high-quality 3D datasets and inconsistency from 2D\nmulti-view generation. We introduce DiffSplat, a novel 3D generative framework\nthat natively generates 3D Gaussian splats by taming large-scale text-to-image\ndiffusion models. It differs from previous 3D generative models by effectively\nutilizing web-scale 2D priors while maintaining 3D consistency in a unified\nmodel. To bootstrap the training, a lightweight reconstruction model is\nproposed to instantly produce multi-view Gaussian splat grids for scalable\ndataset curation. In conjunction with the regular diffusion loss on these\ngrids, a 3D rendering loss is introduced to facilitate 3D coherence across\narbitrary views. The compatibility with image diffusion models enables seamless\nadaptions of numerous techniques for image generation to the 3D realm.\nExtensive experiments reveal the superiority of DiffSplat in text- and\nimage-conditioned generation tasks and downstream applications. Thorough\nablation studies validate the efficacy of each critical design choice and\nprovide insights into the underlying mechanism.\n","authors":["Chenguo Lin","Panwang Pan","Bangbang Yang","Zeming Li","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2501.16764v1.pdf","comment":"Accepted to ICLR 2025; Project page:\n https://chenguolin.github.io/projects/DiffSplat"},{"id":"http://arxiv.org/abs/2501.16760v1","updated":"2025-01-28T07:31:09Z","published":"2025-01-28T07:31:09Z","title":"AdaSemSeg: An Adaptive Few-shot Semantic Segmentation of Seismic Facies","summary":" Automated interpretation of seismic images using deep learning methods is\nchallenging because of the limited availability of training data. Few-shot\nlearning is a suitable learning paradigm in such scenarios due to its ability\nto adapt to a new task with limited supervision (small training budget).\nExisting few-shot semantic segmentation (FSSS) methods fix the number of target\nclasses. Therefore, they do not support joint training on multiple datasets\nvarying in the number of classes. In the context of the interpretation of\nseismic facies, fixing the number of target classes inhibits the generalization\ncapability of a model trained on one facies dataset to another, which is likely\nto have a different number of facies. To address this shortcoming, we propose a\nfew-shot semantic segmentation method for interpreting seismic facies that can\nadapt to the varying number of facies across the dataset, dubbed the AdaSemSeg.\nIn general, the backbone network of FSSS methods is initialized with the\nstatistics learned from the ImageNet dataset for better performance. The lack\nof such a huge annotated dataset for seismic images motivates using a\nself-supervised algorithm on seismic datasets to initialize the backbone\nnetwork. We have trained the AdaSemSeg on three public seismic facies datasets\nwith different numbers of facies and evaluated the proposed method on multiple\nmetrics. 
The performance of the AdaSemSeg on unseen datasets (not used in\ntraining) is better than the prototype-based few-shot method and baselines.\n","authors":["Surojit Saha","Ross Whitaker"],"pdf_url":"https://arxiv.org/pdf/2501.16760v1.pdf","comment":"Under review at IEEE Transactions on Geoscience and Remote Sensing"},{"id":"http://arxiv.org/abs/2501.16757v1","updated":"2025-01-28T07:24:15Z","published":"2025-01-28T07:24:15Z","title":"ITVTON:Virtual Try-On Diffusion Transformer Model Based on Integrated\n Image and Text","summary":" Recent advancements in virtual fitting for characters and clothing have\nleveraged diffusion models to improve the realism of garment fitting. However,\nchallenges remain in handling complex scenes and poses, which can result in\nunnatural garment fitting and poorly rendered intricate patterns. In this work,\nwe introduce ITVTON, a novel method that enhances clothing-character\ninteractions by combining clothing and character images along spatial channels\nas inputs, thereby improving fitting accuracy for the inpainting model.\nAdditionally, we incorporate integrated textual descriptions from multiple\nimages to boost the realism of the generated visual effects. To optimize\ncomputational efficiency, we limit training to the attention parameters within\na single diffusion transformer (Single-DiT) block. To more rigorously address\nthe complexities of real-world scenarios, we curated training samples from the\nIGPair dataset, thereby enhancing ITVTON's performance across diverse\nenvironments. Extensive experiments demonstrate that ITVTON outperforms\nbaseline methods both qualitatively and quantitatively, setting a new standard\nfor virtual fitting tasks.\n","authors":["Haifeng Ni"],"pdf_url":"https://arxiv.org/pdf/2501.16757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16754v1","updated":"2025-01-28T07:15:39Z","published":"2025-01-28T07:15:39Z","title":"SSF-PAN: Semantic Scene Flow-Based Perception for Autonomous Navigation\n in Traffic Scenarios","summary":" Vehicle detection and localization in complex traffic scenarios pose\nsignificant challenges due to the interference of moving objects. Traditional\nmethods often rely on outlier exclusions or semantic segmentations, which\nsuffer from low computational efficiency and accuracy. The proposed SSF-PAN can\nachieve the functionalities of LiDAR point cloud based object\ndetection/localization and SLAM (Simultaneous Localization and Mapping) with\nhigh computational efficiency and accuracy, enabling map-free navigation\nframeworks. The novelty of this work is threefold: 1) developing a neural\nnetwork which can achieve segmentation among static and dynamic objects within\nthe scene flows with different motion features, that is, semantic scene flow\n(SSF); 2) developing an iterative framework which can further optimize the\nquality of input scene flows and output segmentation results; 3) developing a\nscene flow-based navigation platform which can test the performance of the SSF\nperception system in the simulation environment. The proposed SSF-PAN method is\nvalidated using the SUScape-CARLA and the KITTI datasets, as well as on the\nCARLA simulator. 
Experimental results demonstrate that the proposed approach\noutperforms traditional methods in terms of scene flow computation accuracy,\nmoving object detection accuracy, computational efficiency, and autonomous\nnavigation effectiveness.\n","authors":["Yinqi Chen","Meiying Zhang","Qi Hao","Guang Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.16754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16753v1","updated":"2025-01-28T07:12:29Z","published":"2025-01-28T07:12:29Z","title":"Overcoming Semantic Dilution in Transformer-Based Next Frame Prediction","summary":" Next-frame prediction in videos is crucial for applications such as\nautonomous driving, object tracking, and motion prediction. The primary\nchallenge in next-frame prediction lies in effectively capturing and processing\nboth spatial and temporal information from previous video sequences. The\ntransformer architecture, known for its prowess in handling sequence data, has\nmade remarkable progress in this domain. However, transformer-based next-frame\nprediction models face notable issues: (a) The multi-head self-attention (MHSA)\nmechanism requires the input embedding to be split into $N$ chunks, where $N$\nis the number of heads. Each segment captures only a fraction of the original\nembeddings information, which distorts the representation of the embedding in\nthe latent space, resulting in a semantic dilution problem; (b) These models\npredict the embeddings of the next frames rather than the frames themselves,\nbut the loss function based on the errors of the reconstructed frames, not the\npredicted embeddings -- this creates a discrepancy between the training\nobjective and the model output. We propose a Semantic Concentration Multi-Head\nSelf-Attention (SCMHSA) architecture, which effectively mitigates semantic\ndilution in transformer-based next-frame prediction. Additionally, we introduce\na loss function that optimizes SCMHSA in the latent space, aligning the\ntraining objective more closely with the model output. Our method demonstrates\nsuperior performance compared to the original transformer-based predictors.\n","authors":["Hy Nguyen","Srikanth Thudumu","Hung Du","Rajesh Vasa","Kon Mouzakis"],"pdf_url":"https://arxiv.org/pdf/2501.16753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16751v1","updated":"2025-01-28T07:08:20Z","published":"2025-01-28T07:08:20Z","title":"DebugAgent: Efficient and Interpretable Error Slice Discovery for\n Comprehensive Model Debugging","summary":" Despite the significant success of deep learning models in computer vision,\nthey often exhibit systematic failures on specific data subsets, known as error\nslices. Identifying and mitigating these error slices is crucial to enhancing\nmodel robustness and reliability in real-world scenarios. In this paper, we\nintroduce DebugAgent, an automated framework for error slice discovery and\nmodel repair. DebugAgent first generates task-specific visual attributes to\nhighlight instances prone to errors through an interpretable and structured\nprocess. It then employs an efficient slice enumeration algorithm to\nsystematically identify error slices, overcoming the combinatorial challenges\nthat arise during slice exploration. Additionally, DebugAgent extends its\ncapabilities by predicting error slices beyond the validation set, addressing a\nkey limitation of prior approaches. 
Extensive experiments across multiple\ndomains, including image classification, pose estimation, and object detection\n- show that DebugAgent not only improves the coherence and precision of\nidentified error slices but also significantly enhances the model repair\ncapabilities.\n","authors":["Muxi Chen","Chenchen Zhao","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2501.16751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16740v1","updated":"2025-01-28T06:33:30Z","published":"2025-01-28T06:33:30Z","title":"Efficient Knowledge Distillation of SAM for Medical Image Segmentation","summary":" The Segment Anything Model (SAM) has set a new standard in interactive image\nsegmentation, offering robust performance across various tasks. However, its\nsignificant computational requirements limit its deployment in real-time or\nresource-constrained environments. To address these challenges, we propose a\nnovel knowledge distillation approach, KD SAM, which incorporates both encoder\nand decoder optimization through a combination of Mean Squared Error (MSE) and\nPerceptual Loss. This dual-loss framework captures structural and semantic\nfeatures, enabling the student model to maintain high segmentation accuracy\nwhile reducing computational complexity. Based on the model evaluation on\ndatasets, including Kvasir-SEG, ISIC 2017, Fetal Head Ultrasound, and Breast\nUltrasound, we demonstrate that KD SAM achieves comparable or superior\nperformance to the baseline models, with significantly fewer parameters. KD SAM\neffectively balances segmentation accuracy and computational efficiency, making\nit well-suited for real-time medical image segmentation applications in\nresource-constrained environments.\n","authors":["Kunal Dasharath Patil","Gowthamaan Palani","Ganapathy Krishnamurthi"],"pdf_url":"https://arxiv.org/pdf/2501.16740v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.16737v1","updated":"2025-01-28T06:21:57Z","published":"2025-01-28T06:21:57Z","title":"Consistency Diffusion Models for Single-Image 3D Reconstruction with\n Priors","summary":" This paper delves into the study of 3D point cloud reconstruction from a\nsingle image. Our objective is to develop the Consistency Diffusion Model,\nexploring synergistic 2D and 3D priors in the Bayesian framework to ensure\nsuperior consistency in the reconstruction process, a challenging yet critical\nrequirement in this field. Specifically, we introduce a pioneering training\nframework under diffusion models that brings two key innovations. First, we\nconvert 3D structural priors derived from the initial 3D point cloud as a bound\nterm to increase evidence in the variational Bayesian framework, leveraging\nthese robust intrinsic priors to tightly govern the diffusion training process\nand bolster consistency in reconstruction. Second, we extract and incorporate\n2D priors from the single input image, projecting them onto the 3D point cloud\nto enrich the guidance for diffusion training. Our framework not only sidesteps\npotential model learning shifts that may arise from directly imposing\nadditional constraints during training but also precisely transposes the 2D\npriors into the 3D domain. Extensive experimental evaluations reveal that our\napproach sets new benchmarks in both synthetic and real-world datasets. 
The\ncode is included with the submission.\n","authors":["Chenru Jiang","Chengrui Zhang","Xi Yang","Jie Sun","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2501.16737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16733v1","updated":"2025-01-28T06:18:29Z","published":"2025-01-28T06:18:29Z","title":"Dream to Drive with Predictive Individual World Model","summary":" It is still a challenging topic to make reactive driving behaviors in complex\nurban environments as road users' intentions are unknown. Model-based\nreinforcement learning (MBRL) offers great potential to learn a reactive policy\nby constructing a world model that can provide informative states and\nimagination training. However, a critical limitation in relevant research lies\nin the scene-level reconstruction representation learning, which may overlook\nkey interactive vehicles and hardly model the interactive features among\nvehicles and their long-term intentions. Therefore, this paper presents a novel\nMBRL method with a predictive individual world model (PIWM) for autonomous\ndriving. PIWM describes the driving environment from an individual-level\nperspective and captures vehicles' interactive relations and their intentions\nvia trajectory prediction task. Meanwhile, a behavior policy is learned jointly\nwith PIWM. It is trained in PIWM's imagination and effectively navigates in the\nurban driving scenes leveraging intention-aware latent states. The proposed\nmethod is trained and evaluated on simulation environments built upon\nreal-world challenging interactive scenarios. Compared with popular model-free\nand state-of-the-art model-based reinforcement learning methods, experimental\nresults show that the proposed method achieves the best performance in terms of\nsafety and efficiency.\n","authors":["Yinfeng Gao","Qichao Zhang","Da-wei Ding","Dongbin Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.16733v1.pdf","comment":"Codes: https://github.com/gaoyinfeng/PIWM"},{"id":"http://arxiv.org/abs/2501.16724v1","updated":"2025-01-28T06:04:08Z","published":"2025-01-28T06:04:08Z","title":"B-RIGHT: Benchmark Re-evaluation for Integrity in Generalized\n Human-Object Interaction Testing","summary":" Human-object interaction (HOI) is an essential problem in artificial\nintelligence (AI) which aims to understand the visual world that involves\ncomplex relationships between humans and objects. However, current benchmarks\nsuch as HICO-DET face the following limitations: (1) severe class imbalance and\n(2) varying number of train and test sets for certain classes. These issues can\npotentially lead to either inflation or deflation of model performance during\nevaluation, ultimately undermining the reliability of evaluation scores. In\nthis paper, we propose a systematic approach to develop a new class-balanced\ndataset, Benchmark Re-evaluation for Integrity in Generalized Human-object\nInteraction Testing (B-RIGHT), that addresses these imbalanced problems.\nB-RIGHT achieves class balance by leveraging balancing algorithm and automated\ngeneration-and-filtering processes, ensuring an equal number of instances for\neach HOI class. Furthermore, we design a balanced zero-shot test set to\nsystematically evaluate models on unseen scenario. Re-evaluating existing\nmodels using B-RIGHT reveals substantial the reduction of score variance and\nchanges in performance rankings compared to conventional HICO-DET. 
Our\nexperiments demonstrate that evaluation under balanced conditions ensure more\nreliable and fair model comparisons.\n","authors":["Yoojin Jang","Junsu Kim","Hayeon Kim","Eun-ki Lee","Eun-sol Kim","Seungryul Baek","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2501.16724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16720v1","updated":"2025-01-28T05:54:55Z","published":"2025-01-28T05:54:55Z","title":"One Head Eight Arms: Block Matrix based Low Rank Adaptation for\n CLIP-based Few-Shot Learning","summary":" Recent advancements in fine-tuning Vision-Language Foundation Models (VLMs)\nhave garnered significant attention for their effectiveness in downstream\nfew-shot learning tasks.While these recent approaches exhibits some performance\nimprovements, they often suffer from excessive training parameters and high\ncomputational costs. To address these challenges, we propose a novel Block\nmatrix-based low-rank adaptation framework, called Block-LoRA, for fine-tuning\nVLMs on downstream few-shot tasks. Inspired by recent work on Low-Rank\nAdaptation (LoRA), Block-LoRA partitions the original low-rank decomposition\nmatrix of LoRA into a series of sub-matrices while sharing all down-projection\nsub-matrices. This structure not only reduces the number of training\nparameters, but also transforms certain complex matrix multiplication\noperations into simpler matrix addition, significantly lowering the\ncomputational cost of fine-tuning. Notably, Block-LoRA enables fine-tuning CLIP\non the ImageNet few-shot benchmark using a single 24GB GPU. We also show that\nBlock-LoRA has the more tighter bound of generalization error than vanilla\nLoRA. Without bells and whistles, extensive experiments demonstrate that\nBlock-LoRA achieves competitive performance compared to state-of-the-art\nCLIP-based few-shot methods, while maintaining a low training parameters count\nand reduced computational overhead.\n","authors":["Chunpeng Zhou","Qianqian Shen","Zhi Yu","Jiajun Bu","Haishuai Wang"],"pdf_url":"https://arxiv.org/pdf/2501.16720v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2501.16716v1","updated":"2025-01-28T05:47:50Z","published":"2025-01-28T05:47:50Z","title":"Point Cloud Upsampling as Statistical Shape Model for Pelvic","summary":" We propose a novel framework that integrates medical image segmentation and\npoint cloud upsampling for accurate shape reconstruction of pelvic models.\nUsing the SAM-Med3D model for segmentation and a point cloud upsampling network\ntrained on the MedShapeNet dataset, our method transforms sparse medical\nimaging data into high-resolution 3D bone models. This framework leverages\nprior knowledge of anatomical shapes, achieving smoother and more complete\nreconstructions. Quantitative evaluations using metrics such as Chamfer\nDistance etc, demonstrate the effectiveness of the point cloud upsampling in\npelvic model. 
Our approach offers potential applications in reconstructing\nother skeletal structures, providing a robust solution for medical image\nanalysis and statistical shape modeling.\n","authors":["Tongxu Zhang","Bei Wang"],"pdf_url":"https://arxiv.org/pdf/2501.16716v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2501.16714v1","updated":"2025-01-28T05:40:20Z","published":"2025-01-28T05:40:20Z","title":"Separate Motion from Appearance: Customizing Motion via Customizing\n Text-to-Video Diffusion Models","summary":" Motion customization aims to adapt the diffusion model (DM) to generate\nvideos with the motion specified by a set of video clips with the same motion\nconcept. To realize this goal, the adaptation of DM should be possible to model\nthe specified motion concept, without compromising the ability to generate\ndiverse appearances. Thus, the key to solving this problem lies in how to\nseparate the motion concept from the appearance in the adaptation process of\nDM. Typical previous works explore different ways to represent and insert a\nmotion concept into large-scale pretrained text-to-video diffusion models,\ne.g., learning a motion LoRA, using latent noise residuals, etc. While those\nmethods can encode the motion concept, they also inevitably encode the\nappearance in the reference videos, resulting in weakened appearance generation\ncapability. In this paper, we follow the typical way to learn a motion LoRA to\nencode the motion concept, but propose two novel strategies to enhance\nmotion-appearance separation, including temporal attention purification (TAP)\nand appearance highway (AH). Specifically, we assume that in the temporal\nattention module, the pretrained Value embeddings are sufficient to serve as\nbasic components needed by producing a new motion. Thus, in TAP, we choose only\nto reshape the temporal attention with motion LoRAs so that Value embeddings\ncan be reorganized to produce a new motion. Further, in AH, we alter the\nstarting point of each skip connection in U-Net from the output of each\ntemporal attention module to the output of each spatial attention module.\nExtensive experiments demonstrate that compared to previous works, our method\ncan generate videos with appearance more aligned with the text descriptions and\nmotion more consistent with the reference videos.\n","authors":["Huijie Liu","Jingyun Wang","Shuai Ma","Jie Hu","Xiaoming Wei","Guoliang Kang"],"pdf_url":"https://arxiv.org/pdf/2501.16714v1.pdf","comment":"8 pages,6 figures"},{"id":"http://arxiv.org/abs/2409.05466v2","updated":"2025-01-28T05:29:55Z","published":"2024-09-09T09:48:27Z","title":"Proto-OOD: Enhancing OOD Object Detection with Prototype Feature\n Similarity","summary":" Neural networks that are trained on limited category samples often mispredict\nout-of-distribution (OOD) objects. We observe that features of the same\ncategory are more tightly clustered in feature space, while those of different\ncategories are more dispersed. Based on this, we propose using prototype\nsimilarity for OOD detection. Drawing on widely used prototype features in\nfew-shot learning, we introduce a novel OOD detection network structure\n(Proto-OOD). Proto-OOD enhances the representativeness of category prototypes\nusing contrastive loss and detects OOD data by evaluating the similarity\nbetween input features and category prototypes. During training, Proto-OOD\ngenerates OOD samples for training the similarity module with a negative\nembedding generator. 
When Pascal VOC are used as the in-distribution dataset\nand MS-COCO as the OOD dataset, Proto-OOD significantly reduces the FPR (false\npositive rate). Moreover, considering the limitations of existing evaluation\nmetrics, we propose a more reasonable evaluation protocol. The code will be\nreleased.\n","authors":["Junkun Chen","Jilin Mei","Liang Chen","Fangzhou Zhao","Yan Xing","Yu Hu"],"pdf_url":"https://arxiv.org/pdf/2409.05466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16704v1","updated":"2025-01-28T04:46:50Z","published":"2025-01-28T04:46:50Z","title":"DFCon: Attention-Driven Supervised Contrastive Learning for Robust\n Deepfake Detection","summary":" This report presents our approach for the IEEE SP Cup 2025: Deepfake Face\nDetection in the Wild (DFWild-Cup), focusing on detecting deepfakes across\ndiverse datasets. Our methodology employs advanced backbone models, including\nMaxViT, CoAtNet, and EVA-02, fine-tuned using supervised contrastive loss to\nenhance feature separation. These models were specifically chosen for their\ncomplementary strengths. Integration of convolution layers and strided\nattention in MaxViT is well-suited for detecting local features. In contrast,\nhybrid use of convolution and attention mechanisms in CoAtNet effectively\ncaptures multi-scale features. Robust pretraining with masked image modeling of\nEVA-02 excels at capturing global features. After training, we freeze the\nparameters of these models and train the classification heads. Finally, a\nmajority voting ensemble is employed to combine the predictions from these\nmodels, improving robustness and generalization to unseen scenarios. The\nproposed system addresses the challenges of detecting deepfakes in real-world\nconditions and achieves a commendable accuracy of 95.83% on the validation\ndataset.\n","authors":["MD Sadik Hossain Shanto","Mahir Labib Dihan","Souvik Ghosh","Riad Ahmed Anonto","Hafijul Hoque Chowdhury","Abir Muhtasim","Rakib Ahsan","MD Tanvir Hassan","MD Roqunuzzaman Sojib","Sheikh Azizul Hakim","M. Saifur Rahman"],"pdf_url":"https://arxiv.org/pdf/2501.16704v1.pdf","comment":"Technical report for IEEE Signal Processing Cup 2025, 7 pages"},{"id":"http://arxiv.org/abs/2501.16700v1","updated":"2025-01-28T04:33:28Z","published":"2025-01-28T04:33:28Z","title":"Determining Mosaic Resilience in Sugarcane Plants using Hyperspectral\n Images","summary":" Sugarcane mosaic disease poses a serious threat to the Australian sugarcane\nindustry, leading to yield losses of up to 30% in susceptible varieties.\nExisting manual inspection methods for detecting mosaic resilience are\ninefficient and impractical for large-scale application. This study introduces\na novel approach using hyperspectral imaging and machine learning to detect\nmosaic resilience by leveraging global feature representation from local\nspectral patches. Hyperspectral data were collected from eight sugarcane\nvarieties under controlled and field conditions. Local spectral patches were\nanalyzed to capture spatial and spectral variations, which were then aggregated\ninto global feature representations using a ResNet18 deep learning\narchitecture. While classical methods like Support Vector Machines struggled to\nutilize spatial-spectral relationships effectively, the deep learning model\nachieved high classification accuracy, demonstrating its capacity to identify\nmosaic resilience from fine-grained hyperspectral data. 
This approach enhances\nearly detection capabilities, enabling more efficient management of susceptible\nstrains and contributing to sustainable sugarcane production.\n","authors":["Ali Zia","Jun Zhou","Muyiwa Olayemi"],"pdf_url":"https://arxiv.org/pdf/2501.16700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16698v1","updated":"2025-01-28T04:31:19Z","published":"2025-01-28T04:31:19Z","title":"3D-MoE: A Mixture-of-Experts Multi-modal LLM for 3D Vision and Pose\n Diffusion via Rectified Flow","summary":" 3D vision and spatial reasoning have long been recognized as preferable for\naccurately perceiving our three-dimensional world, especially when compared\nwith traditional visual reasoning based on 2D images. Due to the difficulties\nin collecting high-quality 3D data, research in this area has only recently\ngained momentum. With the advent of powerful large language models (LLMs),\nmulti-modal LLMs for 3D vision have been developed over the past few years.\nHowever, most of these models focus primarily on the vision encoder for 3D\ndata. In this paper, we propose converting existing densely activated LLMs into\nmixture-of-experts (MoE) models, which have proven effective for multi-modal\ndata processing. In addition to leveraging these models' instruction-following\ncapabilities, we further enable embodied task planning by attaching a diffusion\nhead, Pose-DiT, that employs a novel rectified flow diffusion scheduler.\nExperimental results on 3D question answering and task-planning tasks\ndemonstrate that our 3D-MoE framework achieves improved performance with fewer\nactivated parameters.\n","authors":["Yueen Ma","Yuzheng Zhuang","Jianye Hao","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2501.16698v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2501.15757v2","updated":"2025-01-28T04:26:12Z","published":"2025-01-27T04:00:05Z","title":"Efficiency Bottlenecks of Convolutional Kolmogorov-Arnold Networks: A\n Comprehensive Scrutiny with ImageNet, AlexNet, LeNet and Tabular\n Classification","summary":" Algorithmic level developments like Convolutional Neural Networks,\ntransformers, attention mechanism, Retrieval Augmented Generation and so on\nhave changed Artificial Intelligence. Recent such development was observed by\nKolmogorov-Arnold Networks that suggested to challenge the fundamental concept\nof a Neural Network, thus change Multilayer Perceptron, and Convolutional\nNeural Networks. They received a good reception in terms of scientific\nmodeling, yet had some drawbacks in terms of efficiency. In this paper, we\ntrain Convolutional Kolmogorov Arnold Networks (CKANs) with the ImageNet-1k\ndataset with 1.3 million images, MNIST dataset with 60k images and a tabular\nbiological science related MoA dataset and test the promise of CKANs in terms\nof FLOPS, Inference Time, number of trainable parameters and training time\nagainst the accuracy, precision, recall and f-1 score they produce against the\nstandard industry practice on CNN models. We show that the CKANs perform fair\nyet slower than CNNs in small size dataset like MoA and MNIST but are not\nnearly comparable as the dataset gets larger and more complex like the\nImageNet. 
The code implementation of this paper can be found on the link:\n\\href{https://github.com/ashimdahal/Study-of-Convolutional-Kolmogorov-Arnold-networks}{https://github.com/ashimdahal/Study-of-Convolutional-Kolmogorov-Arnold-networks}\n","authors":["Ashim Dahal","Saydul Akbar Murad","Nick Rahimi"],"pdf_url":"https://arxiv.org/pdf/2501.15757v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16684v1","updated":"2025-01-28T03:41:24Z","published":"2025-01-28T03:41:24Z","title":"SliceOcc: Indoor 3D Semantic Occupancy Prediction with Vertical Slice\n Representation","summary":" 3D semantic occupancy prediction is a crucial task in visual perception, as\nit requires the simultaneous comprehension of both scene geometry and\nsemantics. It plays a crucial role in understanding 3D scenes and has great\npotential for various applications, such as robotic vision perception and\nautonomous driving. Many existing works utilize planar-based representations\nsuch as Bird's Eye View (BEV) and Tri-Perspective View (TPV). These\nrepresentations aim to simplify the complexity of 3D scenes while preserving\nessential object information, thereby facilitating efficient scene\nrepresentation. However, in dense indoor environments with prevalent\nocclusions, directly applying these planar-based methods often leads to\ndifficulties in capturing global semantic occupancy, ultimately degrading model\nperformance. In this paper, we present a new vertical slice representation that\ndivides the scene along the vertical axis and projects spatial point features\nonto the nearest pair of parallel planes. To utilize these slice features, we\npropose SliceOcc, an RGB camera-based model specifically tailored for indoor 3D\nsemantic occupancy prediction. SliceOcc utilizes pairs of slice queries and\ncross-attention mechanisms to extract planar features from input images. These\nlocal planar features are then fused to form a global scene representation,\nwhich is employed for indoor occupancy prediction. Experimental results on the\nEmbodiedScan dataset demonstrate that SliceOcc achieves a mIoU of 15.45% across\n81 indoor categories, setting a new state-of-the-art performance among RGB\ncamera-based models for indoor 3D semantic occupancy prediction. Code is\navailable at https://github.com/NorthSummer/SliceOcc.\n","authors":["Jianing Li","Ming Lu","Hao Wang","Chenyang Gu","Wenzhao Zheng","Li Du","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.16684v1.pdf","comment":"Accepted by ICRA 2025;"},{"id":"http://arxiv.org/abs/2409.19221v2","updated":"2025-01-28T03:32:03Z","published":"2024-09-28T03:25:33Z","title":"Cauchy activation function and XNet","summary":" We have developed a novel activation function, named the Cauchy Activation\nFunction. This function is derived from the Cauchy Integral Theorem in complex\nanalysis and is specifically tailored for problems requiring high precision.\nThis innovation has led to the creation of a new class of neural networks,\nwhich we call (Comple)XNet, or simply XNet. We will demonstrate that XNet is\nparticularly effective for high-dimensional challenges such as image\nclassification and solving Partial Differential Equations (PDEs). 
Our\nevaluations show that XNet significantly outperforms established benchmarks\nlike MNIST and CIFAR-10 in computer vision, and offers substantial advantages\nover Physics-Informed Neural Networks (PINNs) in both low-dimensional and\nhigh-dimensional PDE scenarios.\n","authors":["Xin Li","Zhihong Xia","Hongkun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.19221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16679v1","updated":"2025-01-28T03:25:37Z","published":"2025-01-28T03:25:37Z","title":"Polyp-Gen: Realistic and Diverse Polyp Image Generation for Endoscopic\n Dataset Expansion","summary":" Automated diagnostic systems (ADS) have shown significant potential in the\nearly detection of polyps during endoscopic examinations, thereby reducing the\nincidence of colorectal cancer. However, due to high annotation costs and\nstrict privacy concerns, acquiring high-quality endoscopic images poses a\nconsiderable challenge in the development of ADS. Despite recent advancements\nin generating synthetic images for dataset expansion, existing endoscopic image\ngeneration algorithms failed to accurately generate the details of polyp\nboundary regions and typically required medical priors to specify plausible\nlocations and shapes of polyps, which limited the realism and diversity of the\ngenerated images. To address these limitations, we present Polyp-Gen, the first\nfull-automatic diffusion-based endoscopic image generation framework.\nSpecifically, we devise a spatial-aware diffusion training scheme with a\nlesion-guided loss to enhance the structural context of polyp boundary regions.\nMoreover, to capture medical priors for the localization of potential polyp\nareas, we introduce a hierarchical retrieval-based sampling strategy to match\nsimilar fine-grained spatial features. In this way, our Polyp-Gen can generate\nrealistic and diverse endoscopic images for building reliable ADS. Extensive\nexperiments demonstrate the state-of-the-art generation quality, and the\nsynthetic images can improve the downstream polyp detection task. Additionally,\nour Polyp-Gen has shown remarkable zero-shot generalizability on other\ndatasets. The source code is available at\nhttps://github.com/CUHK-AIM-Group/Polyp-Gen.\n","authors":["Shengyuan Liu","Zhen Chen","Qiushi Yang","Weihao Yu","Di Dong","Jiancong Hu","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2501.16679v1.pdf","comment":"Accepted by ICRA 2025"},{"id":"http://arxiv.org/abs/2501.16677v1","updated":"2025-01-28T03:22:23Z","published":"2025-01-28T03:22:23Z","title":"Improving Interpretability and Accuracy in Neuro-Symbolic Rule\n Extraction Using Class-Specific Sparse Filters","summary":" There has been significant focus on creating neuro-symbolic models for\ninterpretable image classification using Convolutional Neural Networks (CNNs).\nThese methods aim to replace the CNN with a neuro-symbolic model consisting of\nthe CNN, which is used as a feature extractor, and an interpretable rule-set\nextracted from the CNN itself. While these approaches provide interpretability\nthrough the extracted rule-set, they often compromise accuracy compared to the\noriginal CNN model. In this paper, we identify the root cause of this accuracy\nloss as the post-training binarization of filter activations to extract the\nrule-set. To address this, we propose a novel sparsity loss function that\nenables class-specific filter binarization during CNN training, thus minimizing\ninformation loss when extracting the rule-set. 
We evaluate several training\nstrategies with our novel sparsity loss, analyzing their effectiveness and\nproviding guidance on their appropriate use. Notably, we set a new benchmark,\nachieving a 9% improvement in accuracy and a 53% reduction in rule-set size on\naverage, compared to the previous SOTA, while coming within 3% of the original\nCNN's accuracy. This highlights the significant potential of interpretable\nneuro-symbolic models as viable alternatives to black-box CNNs.\n","authors":["Parth Padalkar","Jaeseong Lee","Shiyi Wei","Gopal Gupta"],"pdf_url":"https://arxiv.org/pdf/2501.16677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16222v2","updated":"2025-01-28T03:15:52Z","published":"2025-01-27T17:13:03Z","title":"SPECIAL: Zero-shot Hyperspectral Image Classification With CLIP","summary":" Hyperspectral image (HSI) classification aims at categorizing each pixel in\nan HSI into a specific land cover class, which is crucial for applications like\nremote sensing, environmental monitoring, and agriculture. Although deep\nlearning-based HSI classification methods have achieved significant\nadvancements, existing methods still rely on manually labeled data for\ntraining, which is both time-consuming and labor-intensive. To address this\nlimitation, we introduce a novel zero-shot hyperspectral image classification\nframework based on CLIP (SPECIAL), aiming to eliminate the need for manual\nannotations. The SPECIAL framework consists of two main stages: (1) CLIP-based\npseudo-label generation, and (2) noisy label learning. In the first stage, HSI\nis spectrally interpolated to produce RGB bands. These bands are subsequently\nclassified using CLIP, resulting in noisy pseudo-labels that are accompanied by\nconfidence scores. To improve the quality of these labels, we propose a scaling\nstrategy that fuses predictions from multiple spatial scales. In the second\nstage, spectral information and a label refinement technique are incorporated\nto mitigate label noise and further enhance classification accuracy.\nExperimental results on three benchmark datasets demonstrate that our SPECIAL\noutperforms existing methods in zero-shot HSI classification, showing its\npotential for more practical applications. The code is available at\nhttps://github.com/LiPang/SPECIAL.\n","authors":["Li Pang","Jing Yao","Kaiyu Li","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2501.16222v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.15151v2","updated":"2025-01-28T03:08:59Z","published":"2025-01-25T09:24:14Z","title":"SpikSSD: Better Extraction and Fusion for Object Detection with Spiking\n Neuron Networks","summary":" As the third generation of neural networks, Spiking Neural Networks (SNNs)\nhave gained widespread attention due to their low energy consumption and\nbiological interpretability. Recently, SNNs have made considerable advancements\nin computer vision. However, efficiently conducting feature extraction and\nfusion under the spiking characteristics of SNNs for object detection remains a\npressing challenge. To address this problem, we propose the SpikSSD, a novel\nSpiking Single Shot Multibox Detector. 
Specifically, we design a full-spiking\nbackbone network, MDS-ResNet, which effectively adjusts the membrane synaptic\ninput distribution at each layer, achieving better spiking feature extraction.\nAdditionally, for spiking feature fusion, we introduce the Spiking Bi-direction\nFusion Module (SBFM), which for the first time realizes bi-direction fusion of\nspiking features, enhancing the multi-scale detection capability of the model.\nExperimental results show that SpikSSD achieves 40.8% mAP on the GEN1 dataset,\n76.3% and 52.4% mAP@0.5 on VOC 2007 and COCO 2017 datasets respectively with\nthe lowest firing rate, outperforming existing SNN-based approaches at ultralow\nenergy consumption. This work sets a new benchmark for future research in\nSNN-based object detection. Our code is publicly available in\nhttps://github.com/yimeng-fan/SpikSSD.\n","authors":["Yimeng Fan","Changsong Liu","Mingyang Li","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.15151v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16665v1","updated":"2025-01-28T03:04:22Z","published":"2025-01-28T03:04:22Z","title":"CSPCL: Category Semantic Prior Contrastive Learning for Deformable\n DETR-Based Prohibited Item Detectors","summary":" Prohibited item detection based on X-ray images is one of the most effective\nsecurity inspection methods. However, the foreground-background feature\ncoupling caused by the overlapping phenomenon specific to X-ray images makes\ngeneral detectors designed for natural images perform poorly. To address this\nissue, we propose a Category Semantic Prior Contrastive Learning (CSPCL)\nmechanism, which aligns the class prototypes perceived by the classifier with\nthe content queries to correct and supplement the missing semantic information\nresponsible for classification, thereby enhancing the model sensitivity to\nforeground features.To achieve this alignment, we design a specific contrastive\nloss, CSP loss, which includes Intra-Class Truncated Attraction (ITA) loss and\nInter-Class Adaptive Repulsion (IAR) loss, and outperforms classic N-pair loss\nand InfoNCE loss. Specifically, ITA loss leverages class prototypes to attract\nintra-class category-specific content queries while preserving necessary\ndistinctiveness. IAR loss utilizes class prototypes to adaptively repel\ninter-class category-specific content queries based on the similarity between\nclass prototypes, helping disentangle features of similar categories.CSPCL is\ngeneral and can be easily integrated into Deformable DETR-based models.\nExtensive experiments on the PIXray and OPIXray datasets demonstrate that CSPCL\nsignificantly enhances the performance of various state-of-the-art models\nwithout increasing complexity.The code will be open source once the paper is\naccepted.\n","authors":["Mingyuan Li","Tong Jia","Hui Lu","Bowen Ma","Hao Wang","Dongyue Chen"],"pdf_url":"https://arxiv.org/pdf/2501.16665v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2501.16664v1","updated":"2025-01-28T02:53:48Z","published":"2025-01-28T02:53:48Z","title":"Improving Vision-Language-Action Model with Online Reinforcement\n Learning","summary":" Recent studies have successfully integrated large vision-language models\n(VLMs) into low-level robotic control by supervised fine-tuning (SFT) with\nexpert robotic datasets, resulting in what we term vision-language-action (VLA)\nmodels. Although the VLA models are powerful, how to improve these large models\nduring interaction with environments remains an open question. 
In this paper,\nwe explore how to further improve these VLA models via Reinforcement Learning\n(RL), a commonly used fine-tuning technique for large models. However, we find\nthat directly applying online RL to large VLA models presents significant\nchallenges, including training instability that severely impacts the\nperformance of large models, and computing burdens that exceed the capabilities\nof most local machines. To address these challenges, we propose iRe-VLA\nframework, which iterates between Reinforcement Learning and Supervised\nLearning to effectively improve VLA models, leveraging the exploratory benefits\nof RL while maintaining the stability of supervised learning. Experiments in\ntwo simulated benchmarks and a real-world manipulation suite validate the\neffectiveness of our method.\n","authors":["Yanjiang Guo","Jianke Zhang","Xiaoyu Chen","Xiang Ji","Yen-Jen Wang","Yucheng Hu","Jianyu Chen"],"pdf_url":"https://arxiv.org/pdf/2501.16664v1.pdf","comment":"Accepted to ICRA 2025"},{"id":"http://arxiv.org/abs/2501.16662v1","updated":"2025-01-28T02:52:04Z","published":"2025-01-28T02:52:04Z","title":"Vision-based autonomous structural damage detection using data-driven\n methods","summary":" This study addresses the urgent need for efficient and accurate damage\ndetection in wind turbine structures, a crucial component of renewable energy\ninfrastructure. Traditional inspection methods, such as manual assessments and\nnon-destructive testing (NDT), are often costly, time-consuming, and prone to\nhuman error. To tackle these challenges, this research investigates advanced\ndeep learning algorithms for vision-based structural health monitoring (SHM). A\ndataset of wind turbine surface images, featuring various damage types and\npollution, was prepared and augmented for enhanced model training. Three\nalgorithms-YOLOv7, its lightweight variant, and Faster R-CNN- were employed to\ndetect and classify surface damage. The models were trained and evaluated on a\ndataset split into training, testing, and evaluation subsets (80%-10%-10%).\nResults indicate that YOLOv7 outperformed the others, achieving 82.4% mAP@50\nand high processing speed, making it suitable for real-time inspections. By\noptimizing hyperparameters like learning rate and batch size, the models'\naccuracy and efficiency improved further. YOLOv7 demonstrated significant\nadvancements in detection precision and execution speed, especially for\nreal-time applications. However, challenges such as dataset limitations and\nenvironmental variability were noted, suggesting future work on segmentation\nmethods and larger datasets. This research underscores the potential of\nvision-based deep learning techniques to transform SHM practices by reducing\ncosts, enhancing safety, and improving reliability, thus contributing to the\nsustainable maintenance of critical infrastructure and supporting the longevity\nof wind energy systems.\n","authors":["Seyyed Taghi Ataei","Parviz Mohammad Zadeh","Saeid Ataei"],"pdf_url":"https://arxiv.org/pdf/2501.16662v1.pdf","comment":"14 pages, 8 figures. This study examines advanced deep learning\n algorithms, specifically YOLOv7, for efficient and accurate damage detection\n in wind turbine structures. 
It significantly enhances detection precision and\n speed for real-time inspections"},{"id":"http://arxiv.org/abs/2501.16652v1","updated":"2025-01-28T02:35:02Z","published":"2025-01-28T02:35:02Z","title":"Molecular-driven Foundation Model for Oncologic Pathology","summary":" Foundation models are reshaping computational pathology by enabling transfer\nlearning, where models pre-trained on vast datasets can be adapted for\ndownstream diagnostic, prognostic, and therapeutic response tasks. Despite\nthese advances, foundation models are still limited in their ability to encode\nthe entire gigapixel whole-slide images without additional training and often\nlack complementary multimodal data. Here, we introduce Threads, a slide-level\nfoundation model capable of generating universal representations of whole-slide\nimages of any size. Threads was pre-trained using a multimodal learning\napproach on a diverse cohort of 47,171 hematoxylin and eosin (H&E)-stained\ntissue sections, paired with corresponding genomic and transcriptomic profiles\n- the largest such paired dataset to be used for foundation model development\nto date. This unique training paradigm enables Threads to capture the tissue's\nunderlying molecular composition, yielding powerful representations applicable\nto a wide array of downstream tasks. In extensive benchmarking across 54\noncology tasks, including clinical subtyping, grading, mutation prediction,\nimmunohistochemistry status determination, treatment response prediction, and\nsurvival prediction, Threads outperformed all baselines while demonstrating\nremarkable generalizability and label efficiency. It is particularly well\nsuited for predicting rare events, further emphasizing its clinical utility. We\nintend to make the model publicly available for the broader community.\n","authors":["Anurag Vaidya","Andrew Zhang","Guillaume Jaume","Andrew H. Song","Tong Ding","Sophia J. Wagner","Ming Y. Lu","Paul Doucet","Harry Robertson","Cristina Almagro-Perez","Richard J. Chen","Dina ElHarouni","Georges Ayoub","Connor Bossi","Keith L. Ligon","Georg Gerber","Long Phi Le","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2501.16652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01826v3","updated":"2025-01-28T02:31:02Z","published":"2024-08-03T17:18:26Z","title":"GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent\n Diffusion Transformer","summary":" Speech-driven talking head generation is a critical yet challenging task with\napplications in augmented reality and virtual human modeling. While recent\napproaches using autoregressive and diffusion-based models have achieved\nnotable progress, they often suffer from modality inconsistencies, particularly\nmisalignment between audio and mesh, leading to reduced motion diversity and\nlip-sync accuracy. To address this, we propose GLDiTalker, a novel\nspeech-driven 3D facial animation model based on a Graph Latent Diffusion\nTransformer. GLDiTalker resolves modality misalignment by diffusing signals\nwithin a quantized spatiotemporal latent space. It employs a two-stage training\npipeline: the Graph-Enhanced Quantized Space Learning Stage ensures lip-sync\naccuracy, while the Space-Time Powered Latent Diffusion Stage enhances motion\ndiversity. Together, these stages enable GLDiTalker to generate realistic,\ntemporally stable 3D facial animations. 
Extensive evaluations on standard\nbenchmarks demonstrate that GLDiTalker outperforms existing methods, achieving\nsuperior results in both lip-sync accuracy and motion diversity.\n","authors":["Yihong Lin","Zhaoxin Fan","Xianjia Wu","Lingyu Xiong","Liang Peng","Xiandong Li","Wenxiong Kang","Songju Lei","Huang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.01826v3.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.11518v2","updated":"2025-01-28T02:29:29Z","published":"2024-08-21T10:51:12Z","title":"EmoFace: Emotion-Content Disentangled Speech-Driven 3D Talking Face\n Animation","summary":" The creation of increasingly vivid 3D talking face has become a hot topic in\nrecent years. Currently, most speech-driven works focus on lip synchronisation\nbut neglect to effectively capture the correlations between emotions and facial\nmotions. To address this problem, we propose a two-stream network called\nEmoFace, which consists of an emotion branch and a content branch. EmoFace\nemploys a novel Mesh Attention mechanism to analyse and fuse the emotion\nfeatures and content features. Particularly, a newly designed spatio-temporal\ngraph-based convolution, SpiralConv3D, is used in Mesh Attention to learn\npotential temporal and spatial feature dependencies between mesh vertices. In\naddition, to the best of our knowledge, it is the first time to introduce a new\nself-growing training scheme with intermediate supervision to dynamically\nadjust the ratio of groundtruth adopted in the 3D face animation task.\nComprehensive quantitative and qualitative evaluations on our high-quality 3D\nemotional facial animation dataset, 3D-RAVDESS ($4.8863\\times 10^{-5}$mm for\nLVE and $0.9509\\times 10^{-5}$mm for EVE), together with the public dataset\nVOCASET ($2.8669\\times 10^{-5}$mm for LVE and $0.4664\\times 10^{-5}$mm for\nEVE), demonstrate that our approach achieves state-of-the-art performance.\n","authors":["Yihong Lin","Liang Peng","Xianjia Wu","Jianqiao Hu","Xiandong Li","Wenxiong Kang","Songju Lei","Huang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.11518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.13341v3","updated":"2025-01-28T02:12:59Z","published":"2025-01-23T02:45:35Z","title":"Multi-aspect Knowledge Distillation with Large Language Model","summary":" Recent advancements in deep learning have significantly improved performance\non computer vision tasks. Previous image classification methods primarily\nmodify model architectures or add features, and they optimize models using\ncross-entropy loss on class logits. Since they focus on classifying images with\nconsidering class labels, these methods may struggle to learn various\n\\emph{aspects} of classes (e.g., natural positions and shape changes).\nRethinking the previous approach from a novel view, we propose a multi-aspect\nknowledge distillation method using Multimodal Large Language Models (MLLMs).\nOur approach involves: 1) querying Large Language Model with multi-aspect\nquestions relevant to the knowledge we want to transfer to the model, 2)\nextracting corresponding logits from MLLM, and 3) expanding the model's output\ndimensions to distill these multi-aspect logits. We then apply cross-entropy\nloss to class logits and binary cross-entropy loss to multi-aspect logits.\nThrough our method, the model can learn not only the knowledge about visual\naspects but also the abstract and complex aspects that require a deeper\nunderstanding. 
We primarily apply our method to image classification, and to\nexplore the potential for extending our model, we expand it to other tasks,\nsuch as object detection. In all experimental results, our method improves the\nperformance of the baselines. Additionally, we analyze the effect of\nmulti-aspect knowledge distillation. These results demonstrate that our method\ncan transfer knowledge about various aspects to the model and the aspect\nknowledge can enhance model performance in computer vision tasks. This paper\ndemonstrates the great potential of multi-aspect knowledge distillation, and we\nbelieve it offers a promising direction for future research in computer vision\nand beyond.\n","authors":["Taegyeong Lee","Jinsik Bang","Soyeong Kwon","Taehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2501.13341v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2501.16629v1","updated":"2025-01-28T02:05:38Z","published":"2025-01-28T02:05:38Z","title":"CHiP: Cross-modal Hierarchical Direct Preference Optimization for\n Multimodal LLMs","summary":" Multimodal Large Language Models (MLLMs) still struggle with hallucinations\ndespite their impressive capabilities. Recent studies have attempted to\nmitigate this by applying Direct Preference Optimization (DPO) to multimodal\nscenarios using preference pairs from text-based responses. However, our\nanalysis of representation distributions reveals that multimodal DPO struggles\nto align image and text representations and to distinguish between hallucinated\nand non-hallucinated descriptions. To address these challenges, in this work,\nwe propose a Cross-modal Hierarchical Direct Preference Optimization (CHiP) to\naddress these limitations. We introduce a visual preference optimization module\nwithin the DPO framework, enabling MLLMs to learn from both textual and visual\npreferences simultaneously. Furthermore, we propose a hierarchical textual\npreference optimization module that allows the model to capture preferences at\nmultiple granular levels, including response, segment, and token levels. We\nevaluate CHiP through both quantitative and qualitative analyses, with results\nacross multiple benchmarks demonstrating its effectiveness in reducing\nhallucinations. On the Object HalBench dataset, CHiP outperforms DPO in\nhallucination reduction, achieving improvements of 52.7% and 55.5% relative\npoints based on the base model Muffin and LLaVA models, respectively. We make\nall our datasets and code publicly available: https://github.com/LVUGAI/CHiP.\n","authors":["Jinlan Fu","Shenzhen Huangfu","Hao Fei","Xiaoyu Shen","Bryan Hooi","Xipeng Qiu","See-Kiong Ng"],"pdf_url":"https://arxiv.org/pdf/2501.16629v1.pdf","comment":"Accepted by ICLR 2025"},{"id":"http://arxiv.org/abs/2501.16617v1","updated":"2025-01-28T01:31:15Z","published":"2025-01-28T01:31:15Z","title":"Predicting 3D representations for Dynamic Scenes","summary":" We present a novel framework for dynamic radiance field prediction given\nmonocular video streams. Unlike previous methods that primarily focus on\npredicting future frames, our method goes a step further by generating explicit\n3D representations of the dynamic scene. The framework builds on two core\ndesigns. First, we adopt an ego-centric unbounded triplane to explicitly\nrepresent the dynamic physical world. Second, we develop a 4D-aware transformer\nto aggregate features from monocular videos to update the triplane. 
Coupling\nthese two designs enables us to train the proposed model with large-scale\nmonocular videos in a self-supervised manner. Our model achieves top results in\ndynamic radiance field prediction on NVIDIA dynamic scenes, demonstrating its\nstrong performance on 4D physical world modeling. Besides, our model shows a\nsuperior generalizability to unseen scenarios. Notably, we find that our\napproach emerges capabilities for geometry and semantic learning.\n","authors":["Di Qi","Tong Yang","Beining Wang","Xiangyu Zhang","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.16617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15268v8","updated":"2025-01-28T01:18:32Z","published":"2023-12-23T14:36:27Z","title":"Manydepth2: Motion-Aware Self-Supervised Multi-Frame Monocular Depth\n Estimation in Dynamic Scenes","summary":" Despite advancements in self-supervised monocular depth estimation,\nchallenges persist in dynamic scenarios due to the dependence on assumptions\nabout a static world. In this paper, we present Manydepth2, to achieve precise\ndepth estimation for both dynamic objects and static backgrounds, all while\nmaintaining computational efficiency. To tackle the challenges posed by dynamic\ncontent, we incorporate optical flow and coarse monocular depth to create a\npseudo-static reference frame. This frame is then utilized to build a\nmotion-aware cost volume in collaboration with the vanilla target frame.\nFurthermore, to improve the accuracy and robustness of the network\narchitecture, we propose an attention-based depth network that effectively\nintegrates information from feature maps at different resolutions by\nincorporating both channel and non-local attention mechanisms. Compared to\nmethods with similar computational costs, Manydepth2 achieves a significant\nreduction of approximately five percent in root-mean-square error for\nself-supervised monocular depth estimation on the KITTI-2015 dataset. The code\ncould be found at https://github.com/kaichen-z/Manydepth2.\n","authors":["Kaichen Zhou","Jia-Wang Bian","Jian-Qing Zheng","Jiaxing Zhong","Qian Xie","Niki Trigoni","Andrew Markham"],"pdf_url":"https://arxiv.org/pdf/2312.15268v8.pdf","comment":"Monocular Depth Estimation, Self-Supervised, Optical Flow"},{"id":"http://arxiv.org/abs/2501.16612v1","updated":"2025-01-28T01:14:24Z","published":"2025-01-28T01:14:24Z","title":"CascadeV: An Implementation of Wurstchen Architecture for Video\n Generation","summary":" Recently, with the tremendous success of diffusion models in the field of\ntext-to-image (T2I) generation, increasing attention has been directed toward\ntheir potential in text-to-video (T2V) applications. However, the computational\ndemands of diffusion models pose significant challenges, particularly in\ngenerating high-resolution videos with high frame rates. In this paper, we\npropose CascadeV, a cascaded latent diffusion model (LDM), that is capable of\nproducing state-of-the-art 2K resolution videos. Experiments demonstrate that\nour cascaded model achieves a higher compression ratio, substantially reducing\nthe computational challenges associated with high-quality video generation. We\nalso implement a spatiotemporal alternating grid 3D attention mechanism, which\neffectively integrates spatial and temporal information, ensuring superior\nconsistency across the generated video frames. 
Furthermore, our model can be\ncascaded with existing T2V models, theoretically enabling a 4$\\times$ increase\nin resolution or frames per second without any fine-tuning. Our code is\navailable at https://github.com/bytedance/CascadeV.\n","authors":["Wenfeng Lin","Jiangchuan Wei","Boyuan Liu","Yichen Zhang","Shiyue Yan","Mingyu Guo"],"pdf_url":"https://arxiv.org/pdf/2501.16612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.16608v1","updated":"2025-01-28T00:55:07Z","published":"2025-01-28T00:55:07Z","title":"Unsupervised Domain Adaptation with Dynamic Clustering and Contrastive\n Refinement for Gait Recognition","summary":" Gait recognition is an emerging identification technology that distinguishes\nindividuals at long distances by analyzing individual walking patterns.\nTraditional techniques rely heavily on large-scale labeled datasets, which\nincurs high costs and significant labeling challenges. Recently, researchers\nhave explored unsupervised gait recognition with clustering-based unsupervised\ndomain adaptation methods and achieved notable success. However, these methods\ndirectly use pseudo-label generated by clustering and neglect pseudolabel noise\ncaused by domain differences, which affects the effect of the model training\nprocess. To mitigate these issues, we proposed a novel model called GaitDCCR,\nwhich aims to reduce the influence of noisy pseudo labels on clustering and\nmodel training. Our approach can be divided into two main stages: clustering\nand training stage. In the clustering stage, we propose Dynamic Cluster\nParameters (DCP) and Dynamic Weight Centroids (DWC) to improve the efficiency\nof clustering and obtain reliable cluster centroids. In the training stage, we\nemploy the classical teacher-student structure and propose Confidence-based\nPseudo-label Refinement (CPR) and Contrastive Teacher Module (CTM) to encourage\nnoisy samples to converge towards clusters containing their true identities.\nExtensive experiments on public gait datasets have demonstrated that our simple\nand effective method significantly enhances the performance of unsupervised\ngait recognition, laying the foundation for its application in the\nreal-world.The code is available at https://github.com/YanSun-github/GaitDCCR\n","authors":["Xiaolei Liu","Yan Sun","Mark Nixon"],"pdf_url":"https://arxiv.org/pdf/2501.16608v1.pdf","comment":"21 pages, 8 figures"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El 
zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Robotics 27 + +
+
+
+ + ☆ Mobile Manipulation Instruction Generation from Multiple Images with + Automatic Metric Enhancement RA-L 2025 + + +
+ We consider the problem of generating free-form mobile manipulation +instructions based on a target object image and receptacle image. Conventional +image captioning models are not able to generate appropriate instructions +because their architectures are typically optimized for single-image. In this +study, we propose a model that handles both the target object and receptacle to +generate free-form instruction sentences for mobile manipulation tasks. +Moreover, we introduce a novel training method that effectively incorporates +the scores from both learning-based and n-gram based automatic evaluation +metrics as rewards. This method enables the model to learn the co-occurrence +relationships between words and appropriate paraphrases. Results demonstrate +that our proposed method outperforms baseline methods including representative +multimodal large language models on standard automatic evaluation metrics. +Moreover, physical experiments reveal that using our method to augment data on +language instructions improves the performance of an existing multimodal +language understanding model for mobile manipulation. + +
+
+ comment: Accepted for IEEE RA-L 2025 +
+
+
+
+
+ + ☆ Six-Degree-of-Freedom Motion Emulation for Data-Driven Modeling of + Underwater Vehicles + + +
+ This article presents a collaborative research effort aimed at developing a +novel six-degree-of-freedom (6-DOF) motion platform for the empirical +characterization of hydrodynamic forces crucial for the control and stability +of surface and subsurface vehicles. Traditional experimental methods, such as +the Planar Motion Mechanism (PMM), are limited by the number of simultaneously +articulated DOFs and are limited to single-frequency testing, making such +systems impractical for resolving frequency-dependent added mass or damping +matrices. The 6 DOF platform, termed a hexapod, overcomes these limitations by +offering enhanced maneuverability and the ability to test broad-banded +frequency spectra in multiple degrees of freedom in a single experiment. + +
+
+
+
+
+ + ☆ Revisit Mixture Models for Multi-Agent Simulation: Experimental Study + within a Unified Framework + + +
+ Simulation plays a crucial role in assessing autonomous driving systems, +where the generation of realistic multi-agent behaviors is a key aspect. In +multi-agent simulation, the primary challenges include behavioral multimodality +and closed-loop distributional shifts. In this study, we revisit mixture models +for generating multimodal agent behaviors, which can cover the mainstream +methods including continuous mixture models and GPT-like discrete models. +Furthermore, we introduce a closed-loop sample generation approach tailored for +mixture models to mitigate distributional shifts. Within the unified mixture +model~(UniMM) framework, we recognize critical configurations from both model +and data perspectives. We conduct a systematic examination of various model +configurations, including positive component matching, continuous regression, +prediction horizon, and the number of components. Moreover, our investigation +into the data configuration highlights the pivotal role of closed-loop samples +in achieving realistic simulations. To extend the benefits of closed-loop +samples across a broader range of mixture models, we further address the +shortcut learning and off-policy learning issues. Leveraging insights from our +exploration, the distinct variants proposed within the UniMM framework, +including discrete, anchor-free, and anchor-based models, all achieve +state-of-the-art performance on the WOSAC benchmark. + +
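+ For readers unfamiliar with the positive-component-matching configuration mentioned above, a minimal winner-takes-all loss for a mixture trajectory head looks roughly like the PyTorch sketch below; this is an illustrative reconstruction, not the UniMM code, and the tensor shapes are assumptions.
+
+ import torch
+
+ def positive_component_matching_loss(pred_trajs, logits, gt_traj):
+     """Winner-takes-all loss for a trajectory mixture head (illustrative only).
+     pred_trajs: (K, T, 2) trajectories, one per mixture component
+     logits:     (K,)      unnormalized component scores
+     gt_traj:    (T, 2)    ground-truth trajectory
+     """
+     # Distance of every component to the ground truth (average L2 per step).
+     dists = ((pred_trajs - gt_traj.unsqueeze(0)) ** 2).sum(-1).sqrt().mean(-1)  # (K,)
+     best = torch.argmin(dists)                       # the "positive" component
+     reg_loss = dists[best]                           # regress only the matched mode
+     cls_loss = torch.nn.functional.cross_entropy(    # push scores toward the matched mode
+         logits.unsqueeze(0), best.unsqueeze(0))
+     return reg_loss + cls_loss
+
+ # Toy usage with random tensors.
+ K, T = 6, 20
+ loss = positive_component_matching_loss(torch.randn(K, T, 2), torch.randn(K), torch.randn(T, 2))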
+
+
+
+
+ + ☆ MAUCell: An Adaptive Multi-Attention Framework for Video Frame + Prediction + + +
+ Temporal sequence modeling is the foundation of video prediction systems, real-time forecasting, and anomaly detection applications. Achieving accurate predictions with efficient resource consumption remains an open problem in contemporary temporal sequence modeling. We introduce the Multi-Attention Unit (MAUCell), which combines Generative Adversarial Networks (GANs) and spatio-temporal attention mechanisms to improve video frame prediction. Our approach employs three types of attention models to capture intricate motion sequences, and a dynamic combination of their outputs lets the model achieve high decision accuracy and output quality while remaining computationally efficient. The integration of GAN elements makes the generated frames appear more lifelike, so the framework produces output sequences that mimic real-world footage. The design maintains an equilibrium between temporal continuity and spatial accuracy to deliver reliable video prediction. A comprehensive evaluation that combines the perceptual LPIPS measure with the classic MSE, MAE, SSIM, and PSNR metrics shows improved capability over contemporary approaches on direct benchmark tests of the Moving MNIST, KTH Action, and CASIA-B (Preprocessed) datasets. Our examination also indicates that MAUCell is promising with respect to operational time requirements. The research findings demonstrate how GANs combined with attention mechanisms can yield better applications for predicting video sequences.
+
+ comment: This work has been submitted to the IJCAI 2025 Conference for review. + It contains: 11 pages, 4 figures, 7 tables, and 3 Algorithms +
+
+
+
+
+ + ☆ Towards Open-Source and Modular Space Systems with ATMOS + + +
+ In the near future, autonomous space systems will compose a large number of +the spacecraft being deployed. Their tasks will involve autonomous rendezvous +and proximity operations with large structures, such as inspections or assembly +of orbiting space stations and maintenance and human-assistance tasks over +shared workspaces. To promote replicable and reliable scientific results for +autonomous control of spacecraft, we present the design of a space systems +laboratory based on open-source and modular software and hardware. The +simulation software provides a software-in-the-loop (SITL) architecture that +seamlessly transfers simulated results to the ATMOS platforms, developed for +testing of multi-agent autonomy schemes for microgravity. The manuscript +presents the KTH space systems laboratory facilities and the ATMOS platform as +open-source hardware and software contributions. Preliminary results showcase +SITL and real testing. + +
+
+ comment: Preliminary release, to be submitted +
+
+
+
+
+ + ☆ Image-based Geo-localization for Robotics: Are Black-box Vision-Language + Models there yet? IROS 2025 + + +
+ The advances in Vision-Language models (VLMs) offer exciting opportunities +for robotic applications involving image geo-localization, the problem of +identifying the geo-coordinates of a place based on visual data only. Recent +research works have focused on using a VLM as embeddings extractor for +geo-localization, however, the most sophisticated VLMs may only be available as +black boxes that are accessible through an API, and come with a number of +limitations: there is no access to training data, model features and gradients; +retraining is not possible; the number of predictions may be limited by the +API; training on model outputs is often prohibited; and queries are open-ended. +The utilization of a VLM as a stand-alone, zero-shot geo-localization system +using a single text-based prompt is largely unexplored. To bridge this gap, +this paper undertakes the first systematic study, to the best of our knowledge, +to investigate the potential of some of the state-of-the-art VLMs as +stand-alone, zero-shot geo-localization systems in a black-box setting with +realistic constraints. We consider three main scenarios for this thorough +investigation: a) fixed text-based prompt; b) semantically-equivalent +text-based prompts; and c) semantically-equivalent query images. We also take +into account the auto-regressive and probabilistic generation process of the +VLMs when investigating their utility for geo-localization task by using model +consistency as a metric in addition to traditional accuracy. Our work provides +new insights in the capabilities of different VLMs for the above-mentioned +scenarios. + +
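+ The model-consistency metric described above can be pictured as repeated black-box queries whose predicted coordinates are compared pairwise; the sketch below is an assumption-laden illustration (the query_vlm_geolocation wrapper is hypothetical), not the paper's protocol.
+
+ import math
+ from itertools import combinations
+
+ def haversine_km(p, q):
+     """Great-circle distance in km between two (lat, lon) points given in degrees."""
+     lat1, lon1, lat2, lon2 = map(math.radians, (*p, *q))
+     dlat, dlon = lat2 - lat1, lon2 - lon1
+     a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
+     return 2 * 6371.0 * math.asin(math.sqrt(a))
+
+ def consistency_score(predictions):
+     """Mean pairwise distance over repeated (lat, lon) predictions; lower = more consistent."""
+     pairs = list(combinations(predictions, 2))
+     return sum(haversine_km(p, q) for p, q in pairs) / len(pairs)
+
+ # query_vlm_geolocation is a hypothetical black-box API wrapper returning (lat, lon):
+ # preds = [query_vlm_geolocation(image, prompt) for _ in range(5)]
+ preds = [(47.37, 8.54), (47.40, 8.55), (47.36, 8.51)]  # placeholder values
+ print(round(consistency_score(preds), 2), "km mean pairwise spread")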
+
+ comment: Submitted to IROS 2025 +
+
+
+
+
+ + ☆ Giving Sense to Inputs: Toward an Accessible Control Framework for + Shared Autonomy + + +
+ While shared autonomy offers significant potential for assistive robotics, +key questions remain about how to effectively map 2D control inputs to 6D robot +motions. An intuitive framework should allow users to input commands +effortlessly, with the robot responding as expected, without users needing to +anticipate the impact of their inputs. In this article, we propose a dynamic +input mapping framework that links joystick movements to motions on control +frames defined along a trajectory encoded with canal surfaces. We evaluate our +method in a user study with 20 participants, demonstrating that our input +mapping framework reduces the workload and improves usability compared to a +baseline mapping with similar motion encoding. To prepare for deployment in +assistive scenarios, we built on the development from the accessible gaming +community to select an accessible control interface. We then tested the system +in an exploratory study, where three wheelchair users controlled the robot for +both daily living activities and a creative painting task, demonstrating its +feasibility for users closer to our target population. + +
+
+
+
+
+ + ☆ RDMM: Fine-Tuned LLM Models for On-Device Robotic Decision Making with + Enhanced Contextual Awareness in Specific Domains + + +
+ Large language models (LLMs) represent a significant advancement in integrating physical robots with AI-driven systems. We showcase the capabilities of our framework within the context of the real-world household competition. This research introduces a framework that utilizes RDMM (Robotics Decision-Making Models), which possess the capacity for decision-making within domain-specific contexts, as well as an awareness of their personal knowledge and capabilities. The framework leverages information to enhance the autonomous decision-making of the system. In contrast to other approaches, our focus is on real-time, on-device solutions, successfully operating on hardware with as little as 8GB of memory. Our framework incorporates visual perception models equipping robots with an understanding of their environment. Additionally, the framework has integrated real-time speech recognition capabilities, thus enhancing the human-robot interaction experience. Experimental results demonstrate that the RDMM framework can plan with 93% accuracy. Furthermore, we introduce a new dataset consisting of 27k planning instances, as well as 1.3k text-image annotated samples derived from the competition. The framework, benchmarks, datasets, and models developed in this work are publicly available on our GitHub repository at https://github.com/shadynasrat/RDMM.
+
+
+
+
+ + ☆ Event-Based Adaptive Koopman Framework for Optic Flow-Guided Landing on + Moving Platforms + + +
+ This paper presents an optic flow-guided approach for achieving soft landings +by resource-constrained unmanned aerial vehicles (UAVs) on dynamic platforms. +An offline data-driven linear model based on Koopman operator theory is +developed to describe the underlying (nonlinear) dynamics of optic flow output +obtained from a single monocular camera that maps to vehicle acceleration as +the control input. Moreover, a novel adaptation scheme within the Koopman +framework is introduced online to handle uncertainties such as unknown platform +motion and ground effect, which exert a significant influence during the +terminal stage of the descent process. Further, to minimize computational +overhead, an event-based adaptation trigger is incorporated into an +event-driven Model Predictive Control (MPC) strategy to regulate optic flow and +track a desired reference. A detailed convergence analysis ensures global +convergence of the tracking error to a uniform ultimate bound while ensuring +Zeno-free behavior. Simulation results demonstrate the algorithm's robustness +and effectiveness in landing on dynamic platforms under ground effect and +sensor noise, which compares favorably to non-adaptive event-triggered and +time-triggered adaptive schemes. + +
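+ The offline data-driven linear model can be pictured as an EDMD-style least-squares fit over lifted observables; the sketch below is a generic illustration under assumed observables and toy data, not the paper's identification pipeline.
+
+ import numpy as np
+
+ def lift(x):
+     """Hand-picked observables (an assumption; the paper selects its own dictionary)."""
+     return np.array([x[0], x[1], x[0] * x[1], x[0] ** 2, 1.0])
+
+ def fit_koopman(X, U, Xn):
+     """Least-squares fit of z_{k+1} ~ A z_k + B u_k from snapshot data."""
+     Z = np.array([lift(x) for x in X]).T          # (nz, N) lifted states
+     Zn = np.array([lift(x) for x in Xn]).T        # (nz, N) lifted next states
+     ZU = np.vstack([Z, U.T])                      # stack lifted state and input
+     AB = Zn @ np.linalg.pinv(ZU)                  # (nz, nz + nu)
+     nz = Z.shape[0]
+     return AB[:, :nz], AB[:, nz:]                 # A, B
+
+ # Toy data: 200 random transitions of an arbitrary nonlinear system (placeholder dynamics).
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(200, 2)); U = rng.normal(size=(200, 1))
+ Xn = np.tanh(X) + 0.1 * U
+ A, B = fit_koopman(X, U, Xn)
+ print(A.shape, B.shape)                           # (5, 5) (5, 1)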
+
+
+
+
+ + ☆ RG-Attn: Radian Glue Attention for Multi-modality Multi-agent + Cooperative Perception + + +
+ Cooperative perception offers an optimal solution to overcome the perception +limitations of single-agent systems by leveraging Vehicle-to-Everything (V2X) +communication for data sharing and fusion across multiple agents. However, most +existing approaches focus on single-modality data exchange, limiting the +potential of both homogeneous and heterogeneous fusion across agents. This +overlooks the opportunity to utilize multi-modality data per agent, restricting +the system's performance. In the automotive industry, manufacturers adopt +diverse sensor configurations, resulting in heterogeneous combinations of +sensor modalities across agents. To harness the potential of every possible +data source for optimal performance, we design a robust LiDAR and camera +cross-modality fusion module, Radian-Glue-Attention (RG-Attn), applicable to +both intra-agent cross-modality fusion and inter-agent cross-modality fusion +scenarios, owing to the convenient coordinate conversion by transformation +matrix and the unified sampling/inversion mechanism. We also propose two +different architectures, named Paint-To-Puzzle (PTP) and +Co-Sketching-Co-Coloring (CoS-CoCo), for conducting cooperative perception. PTP +aims for maximum precision performance and achieves smaller data packet size by +limiting cross-agent fusion to a single instance, but requiring all +participants to be equipped with LiDAR. In contrast, CoS-CoCo supports agents +with any configuration-LiDAR-only, camera-only, or LiDAR-camera-both, +presenting more generalization ability. Our approach achieves state-of-the-art +(SOTA) performance on both real and simulated cooperative perception datasets. +The code will be released at GitHub in early 2025. + +
+
+
+
+
+ + ☆ DIRIGENt: End-To-End Robotic Imitation of Human Demonstrations Based on + a Diffusion Model + + +
+ There has been substantial progress in humanoid robots, with new skills +continuously being taught, ranging from navigation to manipulation. While these +abilities may seem impressive, the teaching methods often remain inefficient. +To enhance the process of teaching robots, we propose leveraging a mechanism +effectively used by humans: teaching by demonstrating. In this paper, we +introduce DIRIGENt (DIrect Robotic Imitation GENeration model), a novel +end-to-end diffusion approach that directly generates joint values from +observing human demonstrations, enabling a robot to imitate these actions +without any existing mapping between it and humans. We create a dataset in +which humans imitate a robot and then use this collected data to train a +diffusion model that enables a robot to imitate humans. The following three +aspects are the core of our contribution. First is our novel dataset with +natural pairs between human and robot poses, allowing our approach to imitate +humans accurately despite the gap between their anatomies. Second, the +diffusion input to our model alleviates the challenge of redundant joint +configurations, limiting the search space. And finally, our end-to-end +architecture from perception to action leads to an improved learning +capability. Through our experimental analysis, we show that combining these +three aspects allows DIRIGENt to outperform existing state-of-the-art +approaches in the field of generating joint values from RGB images. + +
+
+
+
+
+ + ☆ SSF-PAN: Semantic Scene Flow-Based Perception for Autonomous Navigation + in Traffic Scenarios + + +
+ Vehicle detection and localization in complex traffic scenarios pose +significant challenges due to the interference of moving objects. Traditional +methods often rely on outlier exclusions or semantic segmentations, which +suffer from low computational efficiency and accuracy. The proposed SSF-PAN can +achieve the functionalities of LiDAR point cloud based object +detection/localization and SLAM (Simultaneous Localization and Mapping) with +high computational efficiency and accuracy, enabling map-free navigation +frameworks. The novelty of this work is threefold: 1) developing a neural +network which can achieve segmentation among static and dynamic objects within +the scene flows with different motion features, that is, semantic scene flow +(SSF); 2) developing an iterative framework which can further optimize the +quality of input scene flows and output segmentation results; 3) developing a +scene flow-based navigation platform which can test the performance of the SSF +perception system in the simulation environment. The proposed SSF-PAN method is +validated using the SUScape-CARLA and the KITTI datasets, as well as on the +CARLA simulator. Experimental results demonstrate that the proposed approach +outperforms traditional methods in terms of scene flow computation accuracy, +moving object detection accuracy, computational efficiency, and autonomous +navigation effectiveness. + +
+
+
+
+
+ + ☆ Hierarchical Trajectory (Re)Planning for a Large Scale Swarm + + +
+ We consider the trajectory replanning problem for a large-scale swarm in a +cluttered environment. Our path planner replans for robots by utilizing a +hierarchical approach, dividing the workspace, and computing collision-free +paths for robots within each cell in parallel. Distributed trajectory +optimization generates a deadlock-free trajectory for efficient execution and +maintains the control feasibility even when the optimization fails. Our +hierarchical approach combines the benefits of both centralized and +decentralized methods, achieving a high task success rate while providing +real-time replanning capability. Compared to decentralized approaches, our +approach effectively avoids deadlocks and collisions, significantly increasing +the task success rate. We demonstrate the real-time performance of our +algorithm with up to 142 robots in simulation, and a representative 24 physical +Crazyflie nano-quadrotor experiment. + +
+
+ comment: 13 pages, 14 figures. arXiv admin note: substantial text overlap with + arXiv:2407.02777 +
+
+
+
+
+ + ☆ Dream to Drive with Predictive Individual World Model + + +
+ It is still a challenging topic to make reactive driving behaviors in complex +urban environments as road users' intentions are unknown. Model-based +reinforcement learning (MBRL) offers great potential to learn a reactive policy +by constructing a world model that can provide informative states and +imagination training. However, a critical limitation in relevant research lies +in the scene-level reconstruction representation learning, which may overlook +key interactive vehicles and hardly model the interactive features among +vehicles and their long-term intentions. Therefore, this paper presents a novel +MBRL method with a predictive individual world model (PIWM) for autonomous +driving. PIWM describes the driving environment from an individual-level +perspective and captures vehicles' interactive relations and their intentions +via trajectory prediction task. Meanwhile, a behavior policy is learned jointly +with PIWM. It is trained in PIWM's imagination and effectively navigates in the +urban driving scenes leveraging intention-aware latent states. The proposed +method is trained and evaluated on simulation environments built upon +real-world challenging interactive scenarios. Compared with popular model-free +and state-of-the-art model-based reinforcement learning methods, experimental +results show that the proposed method achieves the best performance in terms of +safety and efficiency. + +
+
+ comment: Codes: https://github.com/gaoyinfeng/PIWM +
+
+
+
+
+ + ☆ Optimizing Efficiency of Mixed Traffic through Reinforcement Learning: A + Topology-Independent Approach and Benchmark ICRA 2025 + + +
+ This paper presents a mixed traffic control policy designed to optimize +traffic efficiency across diverse road topologies, addressing issues of +congestion prevalent in urban environments. A model-free reinforcement learning +(RL) approach is developed to manage large-scale traffic flow, using data +collected by autonomous vehicles to influence human-driven vehicles. A +real-world mixed traffic control benchmark is also released, which includes 444 +scenarios from 20 countries, representing a wide geographic distribution and +covering a variety of scenarios and road topologies. This benchmark serves as a +foundation for future research, providing a realistic simulation environment +for the development of effective policies. Comprehensive experiments +demonstrate the effectiveness and adaptability of the proposed method, +achieving better performance than existing traffic control methods in both +intersection and roundabout scenarios. To the best of our knowledge, this is +the first project to introduce a real-world complex scenarios mixed traffic +control benchmark. Videos and code of our work are available at +https://sites.google.com/berkeley.edu/mixedtrafficplus/home + +
+
+ comment: accepted to ICRA 2025 +
+
+
+
+
+ + ☆ Safety-Critical Control for Aerial Physical Interaction in Uncertain + Environment ICRA + + +
+ Aerial manipulation for safe physical interaction with their environments is +gaining significant momentum in robotics research. In this paper, we present a +disturbance-observer-based safety-critical control for a fully actuated aerial +manipulator interacting with both static and dynamic structures. Our approach +centers on a safety filter that dynamically adjusts the desired trajectory of +the vehicle's pose, accounting for the aerial manipulator's dynamics, the +disturbance observer's structure, and motor thrust limits. We provide rigorous +proof that the proposed safety filter ensures the forward invariance of the +safety set - representing motor thrust limits - even in the presence of +disturbance estimation errors. To demonstrate the superiority of our method +over existing control strategies for aerial physical interaction, we perform +comparative experiments involving complex tasks, such as pushing against a +static structure and pulling a plug firmly attached to an electric socket. +Furthermore, to highlight its repeatability in scenarios with sudden dynamic +changes, we perform repeated tests of pushing a movable cart and extracting a +plug from a socket. These experiments confirm that our method not only +outperforms existing methods but also excels in handling tasks with rapid +dynamic variations. + +
+
+ comment: to be presented in 2025 IEEE International Conference on Robotics and + Automation (ICRA), Atlanta, USA, 2025 +
+
+
+
+
+ + ☆ Strawberry Robotic Operation Interface: An Open-Source Device for + Collecting Dexterous Manipulation Data in Robotic Strawberry Farming + + +
+ The strawberry farming is labor-intensive, particularly in tasks requiring +dexterous manipulation such as picking occluded strawberries. To address this +challenge, we present the Strawberry Robotic Operation Interface (SROI), an +open-source device designed for collecting dexterous manipulation data in +robotic strawberry farming. The SROI features a handheld unit with a modular +end effector, a stereo robotic camera, enabling the easy collection of +demonstration data in field environments. A data post-processing pipeline is +introduced to extract spatial trajectories and gripper states from the +collected data. Additionally, we release an open-source dataset of strawberry +picking demonstrations to facilitate research in dexterous robotic +manipulation. The SROI represents a step toward automating complex strawberry +farming tasks, reducing reliance on manual labor. + +
+
+
+
+
+ + ☆ 3D-MoE: A Mixture-of-Experts Multi-modal LLM for 3D Vision and Pose + Diffusion via Rectified Flow + + +
+ 3D vision and spatial reasoning have long been recognized as preferable for +accurately perceiving our three-dimensional world, especially when compared +with traditional visual reasoning based on 2D images. Due to the difficulties +in collecting high-quality 3D data, research in this area has only recently +gained momentum. With the advent of powerful large language models (LLMs), +multi-modal LLMs for 3D vision have been developed over the past few years. +However, most of these models focus primarily on the vision encoder for 3D +data. In this paper, we propose converting existing densely activated LLMs into +mixture-of-experts (MoE) models, which have proven effective for multi-modal +data processing. In addition to leveraging these models' instruction-following +capabilities, we further enable embodied task planning by attaching a diffusion +head, Pose-DiT, that employs a novel rectified flow diffusion scheduler. +Experimental results on 3D question answering and task-planning tasks +demonstrate that our 3D-MoE framework achieves improved performance with fewer +activated parameters. + +
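+ The rectified-flow objective behind a diffusion action head reduces to regressing the straight-line velocity between noise and data; the sketch below is a generic illustration (the tiny network and 7-D pose shape are assumptions, not Pose-DiT).
+
+ import torch
+
+ def rectified_flow_loss(velocity_net, x0, x1):
+     """Generic rectified-flow regression: predict the straight-line velocity x1 - x0."""
+     t = torch.rand(x0.shape[0], 1)          # one time per sample in (0, 1)
+     xt = (1 - t) * x0 + t * x1              # point on the straight path
+     target_v = x1 - x0                      # constant velocity of that path
+     pred_v = velocity_net(xt, t)
+     return torch.mean((pred_v - target_v) ** 2)
+
+ # Tiny stand-in network over 7-D poses (position + quaternion), purely illustrative.
+ class TinyVelocityNet(torch.nn.Module):
+     def __init__(self, dim=7):
+         super().__init__()
+         self.net = torch.nn.Sequential(torch.nn.Linear(dim + 1, 64), torch.nn.SiLU(),
+                                        torch.nn.Linear(64, dim))
+     def forward(self, x, t):
+         return self.net(torch.cat([x, t], dim=-1))
+
+ net = TinyVelocityNet()
+ loss = rectified_flow_loss(net, torch.randn(16, 7), torch.randn(16, 7))
+ loss.backward()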
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ☆ Improving Vision-Language-Action Model with Online Reinforcement + Learning ICRA 2025 + + +
+ Recent studies have successfully integrated large vision-language models +(VLMs) into low-level robotic control by supervised fine-tuning (SFT) with +expert robotic datasets, resulting in what we term vision-language-action (VLA) +models. Although the VLA models are powerful, how to improve these large models +during interaction with environments remains an open question. In this paper, +we explore how to further improve these VLA models via Reinforcement Learning +(RL), a commonly used fine-tuning technique for large models. However, we find +that directly applying online RL to large VLA models presents significant +challenges, including training instability that severely impacts the +performance of large models, and computing burdens that exceed the capabilities +of most local machines. To address these challenges, we propose iRe-VLA +framework, which iterates between Reinforcement Learning and Supervised +Learning to effectively improve VLA models, leveraging the exploratory benefits +of RL while maintaining the stability of supervised learning. Experiments in +two simulated benchmarks and a real-world manipulation suite validate the +effectiveness of our method. + +
+
+ comment: Accepted to ICRA 2025 +
+
+
+
+
+ + ☆ Benchmarking Model Predictive Control and Reinforcement Learning Based + Control for Legged Robot Locomotion in MuJoCo Simulation + + +
+ Model Predictive Control (MPC) and Reinforcement Learning (RL) are two +prominent strategies for controlling legged robots, each with unique strengths. +RL learns control policies through system interaction, adapting to various +scenarios, whereas MPC relies on a predefined mathematical model to solve +optimization problems in real-time. Despite their widespread use, there is a +lack of direct comparative analysis under standardized conditions. This work +addresses this gap by benchmarking MPC and RL controllers on a Unitree Go1 +quadruped robot within the MuJoCo simulation environment, focusing on a +standardized task-straight walking at a constant velocity. Performance is +evaluated based on disturbance rejection, energy efficiency, and terrain +adaptability. The results show that RL excels in handling disturbances and +maintaining energy efficiency but struggles with generalization to new terrains +due to its dependence on learned policies tailored to specific environments. In +contrast, MPC shows enhanced recovery capabilities from larger perturbations by +leveraging its optimization-based approach, allowing for a balanced +distribution of control efforts across the robot's joints. The results provide +a clear understanding of the advantages and limitations of both RL and MPC, +offering insights into selecting an appropriate control strategy for legged +robotic applications. + +
+
+
+
+
+ + ♻ ☆ Decictor: Towards Evaluating the Robustness of Decision-Making in + Autonomous Driving Systems + + +
+ Autonomous Driving System (ADS) testing is crucial in ADS development, with +the current primary focus being on safety. However, the evaluation of +non-safety-critical performance, particularly the ADS's ability to make optimal +decisions and produce optimal paths for autonomous vehicles (AVs), is also +vital to ensure the intelligence and reduce risks of AVs. Currently, there is +little work dedicated to assessing the robustness of ADSs' path-planning +decisions (PPDs), i.e., whether an ADS can maintain the optimal PPD after an +insignificant change in the environment. The key challenges include the lack of +clear oracles for assessing PPD optimality and the difficulty in searching for +scenarios that lead to non-optimal PPDs. To fill this gap, in this paper, we +focus on evaluating the robustness of ADSs' PPDs and propose the first method, +Decictor, for generating non-optimal decision scenarios (NoDSs), where the ADS +does not plan optimal paths for AVs. Decictor comprises three main components: +Non-invasive Mutation, Consistency Check, and Feedback. To overcome the oracle +challenge, Non-invasive Mutation is devised to implement conservative +modifications, ensuring the preservation of the original optimal path in the +mutated scenarios. Subsequently, the Consistency Check is applied to determine +the presence of non-optimal PPDs by comparing the driving paths in the original +and mutated scenarios. To deal with the challenge of large environment space, +we design Feedback metrics that integrate spatial and temporal dimensions of +the AV's movement. These metrics are crucial for effectively steering the +generation of NoDSs. We evaluate Decictor on Baidu Apollo, an open-source and +production-grade ADS. The experimental results validate the effectiveness of +Decictor in detecting non-optimal PPDs of ADSs. + +
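+ The Consistency Check can be pictured as comparing the driving paths from the original and mutated scenarios and flagging a candidate non-optimal decision when they diverge; the metric and threshold below are illustrative assumptions, not Decictor's exact criteria.
+
+ import numpy as np
+
+ def path_deviation(path_a, path_b):
+     """Mean distance from each point of path_a to its nearest point on path_b."""
+     d = np.linalg.norm(path_a[:, None, :] - path_b[None, :, :], axis=-1)  # (Na, Nb)
+     return d.min(axis=1).mean()
+
+ def consistency_check(original_path, mutated_path, threshold_m=1.5):
+     """Flag a potential non-optimal path-planning decision (NoDS candidate)."""
+     dev = max(path_deviation(original_path, mutated_path),
+               path_deviation(mutated_path, original_path))  # symmetric deviation
+     return dev > threshold_m, dev
+
+ # Toy example: the mutated scenario makes the planner swerve by about 2 m.
+ t = np.linspace(0, 50, 100)
+ orig = np.stack([t, np.zeros_like(t)], axis=1)
+ mut = np.stack([t, 2.0 * np.exp(-((t - 25) ** 2) / 20.0)], axis=1)
+ flag, dev = consistency_check(orig, mut)
+ print(flag, round(float(dev), 2))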
+
+
+
+
+ + ♻ ☆ PokeFlex: A Real-World Dataset of Volumetric Deformable Objects for + Robotics + + +
+ Data-driven methods have shown great potential in solving challenging manipulation tasks; however, their application in the domain of deformable objects has been constrained, in part, by the lack of data. To address this gap, we propose PokeFlex, a dataset featuring real-world multimodal data that is paired and annotated. The modalities include 3D textured meshes, point clouds, RGB images, and depth maps. Such data can be leveraged for several downstream tasks, such as online 3D mesh reconstruction, and it can potentially enable underexplored applications such as the real-world deployment of traditional control methods based on mesh simulations. To deal with the challenges posed by real-world 3D mesh reconstruction, we leverage a professional volumetric capture system that allows complete 360° reconstruction. PokeFlex consists of 18 deformable objects with varying stiffness and shapes. Deformations are generated by dropping objects onto a flat surface or by poking the objects with a robot arm. Interaction wrenches and contact locations are also reported for the latter case. Using the different data modalities, we demonstrate a use case for our dataset by training models that, to the best of our knowledge and given the novelty of PokeFlex's multimodal nature, constitute the state of the art in multi-object online template-based mesh reconstruction from multimodal data. We refer the reader to our website ( https://pokeflex-dataset.github.io/ ) for further demos and examples.
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Learning via Multi-Lateral Decoder Branching for Tool + Segmentation in Robot-Assisted Cardiovascular Catheterization + + +
+ Robot-assisted catheterization has garnered considerable attention for its potential in treating cardiovascular diseases. However, advancing surgeon-robot collaboration still requires further research, particularly on task-specific automation. For instance, automated tool segmentation can assist surgeons in visualizing and tracking endovascular tools during cardiac procedures. While learning-based models have demonstrated state-of-the-art segmentation performance, generating ground-truth labels for fully-supervised methods is labor-intensive, time-consuming, and costly. In this study, we propose a weakly-supervised learning method with multi-lateral pseudo labeling for tool segmentation in cardiovascular angiogram datasets. The method utilizes a modified U-Net architecture featuring one encoder and multiple laterally branched decoders. The decoders generate diverse pseudo labels under different perturbations, augmenting the available partial labels. The pseudo labels are self-generated using a mixed loss function with shared consistency across the decoders. The weakly-supervised model was trained end-to-end and validated using partially annotated angiogram data from three cardiovascular catheterization procedures. Validation results show that the model performs close to fully-supervised models. Moreover, the proposed weakly-supervised multi-lateral method outperforms three well-known weakly-supervised learning methods, offering the highest segmentation performance across the three angiogram datasets. Furthermore, numerous ablation studies confirmed the model's consistent performance under different parameters. Finally, the model was applied for tool segmentation in robot-assisted catheterization experiments. The model enhanced visualization with high connectivity indices for the guidewire and catheter, and a mean processing time of 35 ms per frame.
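+ The one-encoder, laterally-branched-decoder idea with a shared consistency term can be sketched with a toy PyTorch module; the layer sizes, dropout perturbations, and loss weights below are assumptions for illustration, not the paper's modified U-Net.
+
+ import torch, torch.nn as nn, torch.nn.functional as F
+
+ class MultiLateralSeg(nn.Module):
+     """Toy one-encoder / N-decoder segmenter (illustrative stand-in only)."""
+     def __init__(self, n_decoders=3):
+         super().__init__()
+         self.encoder = nn.Sequential(nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
+                                      nn.Conv2d(16, 16, 3, padding=1), nn.ReLU())
+         self.decoders = nn.ModuleList(
+             [nn.Conv2d(16, 1, 3, padding=1) for _ in range(n_decoders)])
+
+     def forward(self, x):
+         feats = self.encoder(x)
+         # Each lateral branch sees a differently perturbed feature map (dropout here).
+         return [dec(F.dropout(feats, p=0.1 * (i + 1), training=self.training))
+                 for i, dec in enumerate(self.decoders)]
+
+ def weak_supervision_loss(outputs, partial_label, label_mask):
+     """Supervised BCE on annotated pixels plus pairwise consistency between branches."""
+     sup = sum(F.binary_cross_entropy_with_logits(
+         o[label_mask], partial_label[label_mask]) for o in outputs) / len(outputs)
+     probs = [torch.sigmoid(o) for o in outputs]
+     mean_p = torch.stack(probs).mean(0)
+     cons = sum(F.mse_loss(p, mean_p.detach()) for p in probs) / len(probs)
+     return sup + cons
+
+ model = MultiLateralSeg()
+ x = torch.randn(2, 1, 64, 64)
+ label = torch.randint(0, 2, (2, 1, 64, 64)).float()
+ mask = torch.rand(2, 1, 64, 64) < 0.2          # only 20% of pixels are annotated
+ loss = weak_supervision_loss(model(x), label, mask)
+ loss.backward()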
+
+
+
+
+ + ♻ ☆ SpatialVLA: Exploring Spatial Representations for Visual-Language-Action + Model + + +
+ In this paper, we claim that spatial understanding is the key to robot manipulation, and propose SpatialVLA to explore effective spatial representations for the robot foundation model. Specifically, we introduce Ego3D Position Encoding to inject 3D information into the input observations of the visual-language-action model, and propose Adaptive Action Grids to represent spatial robot movement actions with adaptive discretized action grids, facilitating the learning of generalizable and transferable spatial action knowledge for cross-robot control. SpatialVLA is first pre-trained on top of a vision-language model with 1.1 million real-world robot episodes, to learn a generalist manipulation policy across multiple robot environments and tasks. After pre-training, SpatialVLA is directly applied to perform numerous tasks in a zero-shot manner. The superior results in both simulation and real-world robots demonstrate its advantage in inferring complex robot motion trajectories and its strong in-domain multi-task generalization ability. We further show that the proposed Adaptive Action Grids offer a new and effective way to fine-tune the pre-trained SpatialVLA model for new simulation and real-world setups, where the pre-learned action grids are re-discretized to capture the robot-specific spatial action movements of the new setups. The superior results from extensive evaluations demonstrate exceptional in-distribution generalization and out-of-distribution adaptation capability, highlighting the crucial benefit of the proposed spatial-aware representations for generalist robot policy learning. All details and code will be open-sourced.
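+ Adaptive discretization of continuous robot actions can be illustrated with per-dimension quantile bins, so that each token covers roughly equal data mass and the bins can be re-estimated for a new setup; the sketch below is a simplified stand-in for Adaptive Action Grids, with bin counts and action scales as assumptions.
+
+ import numpy as np
+
+ def build_adaptive_grid(actions, n_bins=32):
+     """Per-dimension quantile edges so every bin covers roughly the same data mass."""
+     qs = np.linspace(0.0, 1.0, n_bins + 1)
+     return np.stack([np.quantile(actions[:, d], qs) for d in range(actions.shape[1])])
+
+ def discretize(action, grid):
+     """Map a continuous action vector to one token index per dimension."""
+     return np.array([np.clip(np.searchsorted(grid[d], action[d]) - 1, 0, grid.shape[1] - 2)
+                      for d in range(len(action))])
+
+ def decode(tokens, grid):
+     """Invert tokens back to bin centers."""
+     return np.array([(grid[d, t] + grid[d, t + 1]) / 2 for d, t in enumerate(tokens)])
+
+ # Toy 3-DoF delta-pose actions with non-uniform spread (an assumption for illustration).
+ rng = np.random.default_rng(1)
+ acts = rng.normal(scale=[0.01, 0.05, 0.2], size=(10000, 3))
+ grid = build_adaptive_grid(acts, n_bins=32)
+ toks = discretize(np.array([0.0, 0.02, -0.1]), grid)
+ print(toks, decode(toks, grid))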
+
+
+
+
+ + ♻ ☆ Front Hair Styling Robot System Using Path Planning for Root-Centric + Strand Adjustment + + +
+ Hair styling is a crucial aspect of personal grooming, significantly +influenced by the appearance of front hair. While brushing is commonly used +both to detangle hair and for styling purposes, existing research primarily +focuses on robotic systems for detangling hair, with limited exploration into +robotic hair styling. This research presents a novel robotic system designed to +automatically adjust front hairstyles, with an emphasis on path planning for +root-centric strand adjustment. The system utilizes images to compare the +current hair state with the desired target state through an orientation map of +hair strands. By concentrating on the differences in hair orientation and +specifically targeting adjustments at the root of each strand, the system +performs detailed styling tasks. The path planning approach ensures effective +alignment of the hairstyle with the target, and a closed-loop mechanism refines +these adjustments to accurately evolve the hairstyle towards the desired +outcome. Experimental results demonstrate that the proposed system achieves a +high degree of similarity and consistency in front hair styling, showing +promising results for automated, precise hairstyle adjustments. + +
+
+ comment: Accepted at IEEE/SICE SII2025 +
+
+
+
+
+ + ♻ ☆ Collision Avoidance and Geofencing for Fixed-wing Aircraft with Control + Barrier Functions + + +
+ Safety-critical failures often have fatal consequences in aerospace control. +Control systems on aircraft, therefore, must ensure the strict satisfaction of +safety constraints, preferably with formal guarantees of safe behavior. This +paper establishes the safety-critical control of fixed-wing aircraft in +collision avoidance and geofencing tasks. A control framework is developed +wherein a run-time assurance (RTA) system modulates the nominal flight +controller of the aircraft whenever necessary to prevent it from colliding with +other aircraft or crossing a boundary (geofence) in space. The RTA is +formulated as a safety filter using control barrier functions (CBFs) with +formal guarantees of safe behavior. CBFs are constructed and compared for a +nonlinear kinematic fixed-wing aircraft model. The proposed CBF-based +controllers showcase the capability of safely executing simultaneous collision +avoidance and geofencing, as demonstrated by simulations on the kinematic model +and a high-fidelity dynamical model. + +
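+ The CBF mechanism can be illustrated on a toy 1-D geofence with single-integrator kinematics, where enforcing hdot + alpha*h >= 0 gives a closed-form safety filter; this is a didactic sketch, not the paper's fixed-wing formulation.
+
+ import numpy as np
+
+ # Toy geofence: keep position x below X_MAX with single-integrator dynamics xdot = u.
+ X_MAX, ALPHA = 100.0, 1.0
+
+ def h(x):                      # barrier: h >= 0 <=> inside the geofence
+     return X_MAX - x
+
+ def cbf_filter(x, u_nom):
+     """Minimally modify u_nom so that hdot + ALPHA*h >= 0 (here hdot = -u)."""
+     u_max = ALPHA * h(x)       # constraint: -u + ALPHA*(X_MAX - x) >= 0  =>  u <= u_max
+     return min(u_nom, u_max)
+
+ # Simulate a nominal controller that blindly pushes toward the fence.
+ x, dt = 0.0, 0.05
+ for _ in range(4000):
+     u = cbf_filter(x, u_nom=5.0)
+     x += dt * u
+ print(round(x, 2), "<=", X_MAX)   # approaches the boundary without ever crossing it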
+
+ comment: Accepted to the IEEE Transactions on Control System Technology. 15 + pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Mechanisms and Computational Design of Multi-Modal End-Effector with + Force Sensing using Gated Networks ICRA25 + + +
+ In limbed robotics, end-effectors must serve dual functions, such as both +feet for locomotion and grippers for grasping, which presents design +challenges. This paper introduces a multi-modal end-effector capable of +transitioning between flat and line foot configurations while providing +grasping capabilities. MAGPIE integrates 8-axis force sensing using proposed +mechanisms with hall effect sensors, enabling both contact and tactile force +measurements. We present a computational design framework for our sensing +mechanism that accounts for noise and interference, allowing for desired +sensitivity and force ranges and generating ideal inverse models. The hardware +implementation of MAGPIE is validated through experiments, demonstrating its +capability as a foot and verifying the performance of the sensing mechanisms, +ideal models, and gated network-based models. + +
+
+ comment: Proceeding to 2025 IEEE International Conference on Robotics and + Automation (ICRA25) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 84 + +
+
+
+ + ☆ CubeDiff: Repurposing Diffusion-Based Image Models for Panorama + Generation ICLR 2025 + + +
+ We introduce a novel method for generating 360° panoramas from text prompts or images. Our approach leverages recent advances in 3D generation by employing multi-view diffusion models to jointly synthesize the six faces of a cubemap. Unlike previous methods that rely on processing equirectangular projections or autoregressive generation, our method treats each face as a standard perspective image, simplifying the generation process and enabling the use of existing multi-view diffusion models. We demonstrate that these models can be adapted to produce high-quality cubemaps without requiring correspondence-aware attention layers. Our model allows for fine-grained text control, generates high-resolution panorama images, and generalizes well beyond its training set, whilst achieving state-of-the-art results, both qualitatively and quantitatively. Project page: https://cubediff.github.io/
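+ Treating each cubemap face as a standard 90-degree-FOV perspective image can be made concrete by generating per-face ray directions, which is also what one needs to stitch the six faces back into a panorama; the face-orientation convention below is an assumption, not the paper's.
+
+ import numpy as np
+
+ # One possible cubemap convention: forward, right, up vectors per face (an assumption).
+ FACES = {
+     "+x": (np.array([1, 0, 0]), np.array([0, 0, -1]), np.array([0, -1, 0])),
+     "-x": (np.array([-1, 0, 0]), np.array([0, 0, 1]), np.array([0, -1, 0])),
+     "+y": (np.array([0, 1, 0]), np.array([1, 0, 0]), np.array([0, 0, 1])),
+     "-y": (np.array([0, -1, 0]), np.array([1, 0, 0]), np.array([0, 0, -1])),
+     "+z": (np.array([0, 0, 1]), np.array([1, 0, 0]), np.array([0, -1, 0])),
+     "-z": (np.array([0, 0, -1]), np.array([-1, 0, 0]), np.array([0, -1, 0])),
+ }
+
+ def face_ray_directions(face, size):
+     """Unit view rays for a 90-degree-FOV perspective image covering one cube face."""
+     forward, right, up = FACES[face]
+     # Pixel centers mapped to [-1, 1] on the face plane at unit distance.
+     u = (np.arange(size) + 0.5) / size * 2 - 1
+     uu, vv = np.meshgrid(u, u)
+     dirs = (forward[None, None, :] + uu[..., None] * right[None, None, :]
+             + vv[..., None] * up[None, None, :])
+     return dirs / np.linalg.norm(dirs, axis=-1, keepdims=True)
+
+ rays = face_ray_directions("+x", size=64)
+ print(rays.shape)   # (64, 64, 3), all unit length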
+
+ comment: Accepted at ICLR 2025 +
+
+
+
+
+ + ☆ SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model + Post-training + + +
+ Supervised fine-tuning (SFT) and reinforcement learning (RL) are widely used
+ post-training techniques for foundation models. However, their roles in
+ enhancing model generalization capabilities remain unclear. This paper studies
+ the difference between SFT and RL on generalization and memorization, focusing
+ on text-based rule variants and visual variants. We introduce GeneralPoints, an
+ arithmetic reasoning card game, and adopt V-IRL, a real-world navigation
+ environment, to assess how models trained with SFT and RL generalize to unseen
+ variants in both textual and visual domains. We show that RL, especially when
+ trained with an outcome-based reward, generalizes across both rule-based
+ textual and visual variants. SFT, in contrast, tends to memorize training data
+ and struggles to generalize to out-of-distribution scenarios. Further analysis
+ reveals that RL improves the model's underlying visual recognition
+ capabilities, contributing to its enhanced generalization in the visual domain.
+ Despite RL's superior generalization, we show that SFT remains essential for
+ effective RL training; SFT stabilizes the model's output format, enabling
+ subsequent RL to achieve its performance gains. These findings demonstrate the
+ capability of RL for acquiring generalizable knowledge in complex, multi-modal
+ tasks.
+
+ comment: Website at https://tianzhechu.com/SFTvsRL +
+
+
+
+
+ + ☆ A Hybrid Deep Learning CNN Model for Enhanced COVID-19 Detection from + Computed Tomography (CT) Scan Images + + +
+ Early detection of COVID-19 is crucial for effective treatment and +controlling its spread. This study proposes a novel hybrid deep learning model +for detecting COVID-19 from CT scan images, designed to assist overburdened +medical professionals. Our proposed model leverages the strengths of VGG16, +DenseNet121, and MobileNetV2 to extract features, followed by Principal +Component Analysis (PCA) for dimensionality reduction, after which the features +are stacked and classified using a Support Vector Classifier (SVC). We +conducted comparative analysis between the proposed hybrid model and individual +pre-trained CNN models, using a dataset of 2,108 training images and 373 test +images comprising both COVID-positive and non-COVID images. Our proposed hybrid +model achieved an accuracy of 98.93%, outperforming the individual models in +terms of precision, recall, F1 scores, and ROC curve performance. + +
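+
+ The stacking pipeline described above can be prototyped in a few lines. The
+ sketch below is a minimal, hypothetical reconstruction in Python: the pooling
+ choice, PCA dimensionality, and SVC settings are assumptions for illustration,
+ not the authors' exact configuration, and each backbone's own preprocess_input
+ step is omitted for brevity.
+
+ # Minimal sketch of the described hybrid pipeline: three frozen ImageNet
+ # backbones -> per-branch PCA -> stacked features -> SVC classifier.
+ import numpy as np
+ from tensorflow.keras.applications import VGG16, DenseNet121, MobileNetV2
+ from sklearn.decomposition import PCA
+ from sklearn.svm import SVC
+
+ def build_extractors(input_shape=(224, 224, 3)):
+     # Global-average-pooled features from each ImageNet-pretrained backbone.
+     return [
+         VGG16(weights="imagenet", include_top=False, pooling="avg", input_shape=input_shape),
+         DenseNet121(weights="imagenet", include_top=False, pooling="avg", input_shape=input_shape),
+         MobileNetV2(weights="imagenet", include_top=False, pooling="avg", input_shape=input_shape),
+     ]
+
+ def stacked_features(models, images, pcas=None, n_components=128, fit=False):
+     # Extract pooled features per backbone, reduce with PCA, then concatenate.
+     feats = []
+     if pcas is None:
+         pcas = [PCA(n_components=n_components) for _ in models]
+     for model, pca in zip(models, pcas):
+         f = model.predict(images, verbose=0)
+         feats.append(pca.fit_transform(f) if fit else pca.transform(f))
+     return np.concatenate(feats, axis=1), pcas
+
+ # Usage sketch (x_train/x_test are preprocessed CT images, y_* are labels):
+ # extractors = build_extractors()
+ # X_tr, pcas = stacked_features(extractors, x_train, fit=True)
+ # X_te, _ = stacked_features(extractors, x_test, pcas=pcas)
+ # clf = SVC(kernel="rbf").fit(X_tr, y_train); print(clf.score(X_te, y_test))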
+
+ comment: Corresponding authors: Shanthi Karpurapu + (shanthi.karpurapu@gmail.com), Suresh Babu Nettur (nettursuresh@gmail.com) + Shanthi Karpurapu and Suresh Babu Nettur are co-first authors +
+
+
+
+
+ + ☆ IC-Portrait: In-Context Matching for View-Consistent Personalized + Portrait + + +
+ Existing diffusion models show great potential for identity-preserving +generation. However, personalized portrait generation remains challenging due +to the diversity in user profiles, including variations in appearance and +lighting conditions. To address these challenges, we propose IC-Portrait, a +novel framework designed to accurately encode individual identities for +personalized portrait generation. Our key insight is that pre-trained diffusion +models are fast learners (e.g.,100 ~ 200 steps) for in-context dense +correspondence matching, which motivates the two major designs of our +IC-Portrait framework. Specifically, we reformulate portrait generation into +two sub-tasks: 1) Lighting-Aware Stitching: we find that masking a high +proportion of the input image, e.g., 80%, yields a highly effective +self-supervisory representation learning of reference image lighting. 2) +View-Consistent Adaptation: we leverage a synthetic view-consistent profile +dataset to learn the in-context correspondence. The reference profile can then +be warped into arbitrary poses for strong spatial-aligned view conditioning. +Coupling these two designs by simply concatenating latents to form +ControlNet-like supervision and modeling, enables us to significantly enhance +the identity preservation fidelity and stability. Extensive evaluations +demonstrate that IC-Portrait consistently outperforms existing state-of-the-art +methods both quantitatively and qualitatively, with particularly notable +improvements in visual qualities. Furthermore, IC-Portrait even demonstrates +3D-aware relighting capabilities. + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ Scenario Understanding of Traffic Scenes Through Large Visual Language + Models + + +
+ Deep learning models for autonomous driving, encompassing perception, +planning, and control, depend on vast datasets to achieve their high +performance. However, their generalization often suffers due to domain-specific +data distributions, making an effective scene-based categorization of samples +necessary to improve their reliability across diverse domains. Manual +captioning, though valuable, is both labor-intensive and time-consuming, +creating a bottleneck in the data annotation process. Large Visual Language +Models (LVLMs) present a compelling solution by automating image analysis and +categorization through contextual queries, often without requiring retraining +for new categories. In this study, we evaluate the capabilities of LVLMs, +including GPT-4 and LLaVA, to understand and classify urban traffic scenes on +both an in-house dataset and the BDD100K. We propose a scalable captioning +pipeline that integrates state-of-the-art models, enabling a flexible +deployment on new datasets. Our analysis, combining quantitative metrics with +qualitative insights, demonstrates the effectiveness of LVLMs to understand +urban traffic scenarios and highlights their potential as an efficient tool for +data-driven advancements in autonomous driving. + +
+
+ comment: Accepted at WACV2025 +
+
+
+
+
+ + ☆ Text-to-Image Generation for Vocabulary Learning Using the Keyword + Method + + +
+ The 'keyword method' is an effective technique for learning vocabulary of a +foreign language. It involves creating a memorable visual link between what a +word means and what its pronunciation in a foreign language sounds like in the +learner's native language. However, these memorable visual links remain +implicit in the people's mind and are not easy to remember for a large set of +words. To enhance the memorisation and recall of the vocabulary, we developed +an application that combines the keyword method with text-to-image generators +to externalise the memorable visual links into visuals. These visuals represent +additional stimuli during the memorisation process. To explore the +effectiveness of this approach we first run a pilot study to investigate how +difficult it is to externalise the descriptions of mental visualisations of +memorable links, by asking participants to write them down. We used these +descriptions as prompts for text-to-image generator (DALL-E2) to convert them +into images and asked participants to select their favourites. Next, we +compared different text-to-image generators (DALL-E2, Midjourney, Stable and +Latent Diffusion) to evaluate the perceived quality of the generated images by +each. Despite heterogeneous results, participants mostly preferred images +generated by DALL-E2, which was used also for the final study. In this study, +we investigated whether providing such images enhances the retention of +vocabulary being learned, compared to the keyword method only. Our results +indicate that people did not encounter difficulties describing their +visualisations of memorable links and that providing corresponding images +significantly improves memory retention. + +
+
+
+
+
+ + ☆ Evaluating CrowdSplat: Perceived Level of Detail for Gaussian Crowds + + +
+ Efficient and realistic crowd rendering is an important element of many +real-time graphics applications such as Virtual Reality (VR) and games. To this +end, Levels of Detail (LOD) avatar representations such as polygonal meshes, +image-based impostors, and point clouds have been proposed and evaluated. More +recently, 3D Gaussian Splatting has been explored as a potential method for +real-time crowd rendering. In this paper, we present a two-alternative forced +choice (2AFC) experiment that aims to determine the perceived quality of 3D +Gaussian avatars. Three factors were explored: Motion, LOD (i.e., #Gaussians), +and the avatar height in Pixels (corresponding to the viewing distance). +Participants viewed pairs of animated 3D Gaussian avatars and were tasked with +choosing the most detailed one. Our findings can inform the optimization of LOD +strategies in Gaussian-based crowd rendering, thereby helping to achieve +efficient rendering while maintaining visual quality in real-time applications. + +
+
+ comment: 5 pages, 5 figures +
+
+
+
+
+ + ☆ DINOSTAR: Deep Iterative Neural Object Detector Self-Supervised Training + for Roadside LiDAR Applications + + +
+ Recent advancements in deep-learning methods for object detection in
+ point-cloud data have enabled numerous roadside applications, fostering
+ improvements in transportation safety and management. However, the intricate
+ nature of point-cloud data poses significant challenges for human-supervised
+ labeling, resulting in substantial expenditures of time and capital. This paper
+ addresses the issue by developing an end-to-end, scalable, and self-supervised
+ framework for training deep object detectors tailored for roadside point-cloud
+ data. The proposed framework leverages self-supervised, statistically modeled
+ teachers to train off-the-shelf deep object detectors, thus circumventing the
+ need for human supervision. The teacher models follow a fine-tuned set of
+ standard practices for background filtering, object clustering, bounding-box
+ fitting, and classification to generate noisy labels. We show that training the
+ student model on the combined noisy annotations from a multitude of teachers
+ enhances its capacity to discern background from foreground more effectively
+ and forces it to learn diverse point-cloud representations for the object
+ categories of interest. The evaluations, involving publicly available roadside
+ datasets and state-of-the-art deep object detectors, demonstrate that the
+ proposed framework achieves performance comparable to deep object detectors
+ trained on human-annotated labels, despite not utilizing such human annotations
+ in its training process.
+
+ comment: conference, 6 pages +
+
+
+
+
+ + ☆ EdgeMLOps: Operationalizing ML models with Cumulocity IoT and + thin-edge.io for Visual quality Inspection + + +
+ This paper introduces EdgeMLOps, a framework leveraging Cumulocity IoT and +thin-edge.io for deploying and managing machine learning models on +resource-constrained edge devices. We address the challenges of model +optimization, deployment, and lifecycle management in edge environments. The +framework's efficacy is demonstrated through a visual quality inspection (VQI) +use case where images of assets are processed on edge devices, enabling +real-time condition updates within an asset management system. Furthermore, we +evaluate the performance benefits of different quantization methods, +specifically static and dynamic signed-int8, on a Raspberry Pi 4, demonstrating +significant inference time reductions compared to FP32 precision. Our results +highlight the potential of EdgeMLOps to enable efficient and scalable AI +deployments at the edge for industrial applications. + +
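+
+ The static vs. dynamic signed-int8 comparison mentioned above can be reproduced
+ in spirit with PyTorch's built-in quantization utilities. The sketch below is
+ illustrative only: the tiny model, random calibration data, and backend choice
+ are placeholders rather than the EdgeMLOps implementation.
+
+ # Sketch: comparing FP32, dynamic int8, and static int8 inference latency.
+ import copy, time
+ import torch
+ import torch.nn as nn
+
+ # Stand-in FP32 model; the paper's VQI model would take its place.
+ model_fp32 = nn.Sequential(nn.Flatten(), nn.Linear(3 * 64 * 64, 256),
+                            nn.ReLU(), nn.Linear(256, 2)).eval()
+
+ # Dynamic int8: weights quantized ahead of time, activations on the fly.
+ model_dyn = torch.ao.quantization.quantize_dynamic(
+     model_fp32, {nn.Linear}, dtype=torch.qint8)
+
+ # Static int8: observers record activation ranges during calibration.
+ torch.backends.quantized.engine = "fbgemm"  # use "qnnpack" on ARM boards such as a Raspberry Pi 4
+ model_static = nn.Sequential(torch.ao.quantization.QuantStub(),
+                              copy.deepcopy(model_fp32),
+                              torch.ao.quantization.DeQuantStub()).eval()
+ model_static.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+ torch.ao.quantization.prepare(model_static, inplace=True)
+ with torch.no_grad():
+     for _ in range(8):                      # calibration passes on sample data
+         model_static(torch.randn(1, 3, 64, 64))
+ torch.ao.quantization.convert(model_static, inplace=True)
+
+ def latency_ms(m, runs=50):
+     x = torch.randn(1, 3, 64, 64)
+     with torch.no_grad():
+         start = time.perf_counter()
+         for _ in range(runs):
+             m(x)
+     return (time.perf_counter() - start) / runs * 1e3
+
+ for name, m in [("fp32", model_fp32), ("dynamic int8", model_dyn),
+                 ("static int8", model_static)]:
+     print(name, f"{latency_ms(m):.2f} ms")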
+
+
+
+
+ + ☆ Contextual Self-paced Learning for Weakly Supervised Spatio-Temporal + Video Grounding ICLR'25 + + +
+ In this work, we focus on Weakly Supervised Spatio-Temporal Video Grounding +(WSTVG). It is a multimodal task aimed at localizing specific subjects +spatio-temporally based on textual queries without bounding box supervision. +Motivated by recent advancements in multi-modal foundation models for grounding +tasks, we first explore the potential of state-of-the-art object detection +models for WSTVG. Despite their robust zero-shot capabilities, our adaptation +reveals significant limitations, including inconsistent temporal predictions, +inadequate understanding of complex queries, and challenges in adapting to +difficult scenarios. We propose CoSPaL (Contextual Self-Paced Learning), a +novel approach which is designed to overcome these limitations. CoSPaL +integrates three core components: (1) Tubelet Phrase Grounding (TPG), which +introduces spatio-temporal prediction by linking textual queries to tubelets; +(2) Contextual Referral Grounding (CRG), which improves comprehension of +complex queries by extracting contextual information to refine object +identification over time; and (3) Self-Paced Scene Understanding (SPS), a +training paradigm that progressively increases task difficulty, enabling the +model to adapt to complex scenarios by transitioning from coarse to +fine-grained understanding. + +
+
+ comment: ICLR'25 Main Conference. Project Page: + https://akash2907.github.io/cospal_webpage +
+
+
+
+
+ + ☆ Synthesizing 3D Abstractions by Inverting Procedural Buildings with + Transformers + + +
+ We generate abstractions of buildings, reflecting the essential aspects of +their geometry and structure, by learning to invert procedural models. We first +build a dataset of abstract procedural building models paired with simulated +point clouds and then learn the inverse mapping through a transformer. Given a +point cloud, the trained transformer then infers the corresponding abstracted +building in terms of a programmatic language description. This approach +leverages expressive procedural models developed for gaming and animation, and +thereby retains desirable properties such as efficient rendering of the +inferred abstractions and strong priors for regularity and symmetry. Our +approach achieves good reconstruction accuracy in terms of geometry and +structure, as well as structurally consistent inpainting. + +
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ☆ MAUCell: An Adaptive Multi-Attention Framework for Video Frame + Prediction + + +
+ Temporal sequence modeling is the foundation of video prediction systems,
+ real-time forecasting, and anomaly detection applications. Achieving accurate
+ predictions with efficient resource consumption remains an ongoing challenge in
+ contemporary temporal sequence modeling. We introduce the Multi-Attention Unit
+ (MAUCell), which combines Generative Adversarial Networks (GANs) and
+ spatio-temporal attention mechanisms to improve video frame prediction
+ capabilities. Our approach implements three types of attention models to
+ capture intricate motion sequences. A dynamic combination of these attention
+ outputs allows the model to achieve both high prediction accuracy and superior
+ visual quality while remaining computationally efficient. The integration of
+ GAN elements makes the generated frames appear more true to life, so the
+ framework produces output sequences that mimic real-world footage. The design
+ maintains an equilibrium between temporal continuity and spatial accuracy to
+ deliver reliable video prediction. A comprehensive evaluation combining the
+ perceptual LPIPS measure with the classic MSE, MAE, SSIM, and PSNR metrics
+ shows improvements over contemporary approaches in direct benchmark tests on
+ the Moving MNIST, KTH Action, and CASIA-B (Preprocessed) datasets. Our
+ examination also indicates that MAUCell shows promise with respect to runtime
+ requirements. The findings demonstrate how GANs combined with attention
+ mechanisms can lead to better video sequence prediction.
+
+ comment: This work has been submitted to the IJCAI 2025 Conference for review. + It contains: 11 pages, 4 figures, 7 tables, and 3 Algorithms +
+
+
+
+
+ + ☆ FedEFM: Federated Endovascular Foundation Model with Unseen Data ICRA 2025 + + +
+ In endovascular surgery, the precise identification of catheters and +guidewires in X-ray images is essential for reducing intervention risks. +However, accurately segmenting catheter and guidewire structures is challenging +due to the limited availability of labeled data. Foundation models offer a +promising solution by enabling the collection of similar domain data to train +models whose weights can be fine-tuned for downstream tasks. Nonetheless, +large-scale data collection for training is constrained by the necessity of +maintaining patient privacy. This paper proposes a new method to train a +foundation model in a decentralized federated learning setting for endovascular +intervention. To ensure the feasibility of the training, we tackle the unseen +data issue using differentiable Earth Mover's Distance within a knowledge +distillation framework. Once trained, our foundation model's weights provide +valuable initialization for downstream tasks, thereby enhancing task-specific +performance. Intensive experiments show that our approach achieves new +state-of-the-art results, contributing to advancements in endovascular +intervention and robotic-assisted endovascular surgery, while addressing the +critical issue of data sharing in the medical domain. + +
+
+ comment: 8 pages. Accepted to ICRA 2025 +
+
+
+
+
+ + ☆ Modulating CNN Features with Pre-Trained ViT Representations for + Open-Vocabulary Object Detection + + +
+ Owing to large-scale image-text contrastive training, pre-trained
+ vision-language models (VLMs) such as CLIP show superior open-vocabulary
+ recognition ability. Most existing open-vocabulary object detectors attempt to
+ utilize the pre-trained VLM to attain generalizable representations. F-ViT uses
+ the pre-trained visual encoder as the backbone network and freezes it during
+ training. However, the frozen backbone does not benefit from the labeled data
+ to strengthen the representation. Therefore, we propose a novel two-branch
+ backbone network design, named ViT-Feature-Modulated Multi-Scale Convolutional
+ network (VMCNet). VMCNet consists of a trainable convolutional branch, a frozen
+ pre-trained ViT branch and a feature modulation module. The trainable CNN
+ branch can be optimized with labeled data, while the frozen pre-trained ViT
+ branch keeps the representation ability derived from large-scale pre-training.
+ The proposed feature modulation module then modulates the multi-scale CNN
+ features with the representations from the ViT branch. With the proposed mixed
+ structure, the detector is more likely to discover novel categories. Evaluated
+ on two popular benchmarks, our method boosts detection performance on novel
+ categories and outperforms the baseline. On OV-COCO, the proposed method
+ achieves 44.3 AP$_{50}^{\mathrm{novel}}$ with ViT-B/16 and 48.5
+ AP$_{50}^{\mathrm{novel}}$ with ViT-L/14. On OV-LVIS, VMCNet with ViT-B/16 and
+ ViT-L/14 reaches 27.8 and 38.4 mAP$_{r}$.
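+
+ As a rough illustration of how frozen ViT representations might modulate
+ trainable multi-scale CNN features, the sketch below applies a FiLM-style
+ scale-and-shift predicted from pooled ViT tokens. This is a hedged
+ interpretation of the idea, not VMCNet's actual module; the pooling and the
+ linear projection are assumptions.
+
+ # Sketch: modulate a CNN feature map with scale/shift parameters predicted
+ # from a frozen ViT's tokens (FiLM-style modulation).
+ import torch
+ import torch.nn as nn
+
+ class ViTFeatureModulation(nn.Module):
+     def __init__(self, vit_dim: int, cnn_channels: int):
+         super().__init__()
+         self.to_scale_shift = nn.Linear(vit_dim, 2 * cnn_channels)
+
+     def forward(self, cnn_feat, vit_tokens):
+         # cnn_feat: (B, C, H, W) from the trainable CNN branch
+         # vit_tokens: (B, N, D) from the frozen pre-trained ViT branch
+         pooled = vit_tokens.mean(dim=1)                 # (B, D) global descriptor
+         scale, shift = self.to_scale_shift(pooled).chunk(2, dim=-1)
+         scale = scale[:, :, None, None]
+         shift = shift[:, :, None, None]
+         return cnn_feat * (1 + scale) + shift           # modulated feature map
+
+ # y = ViTFeatureModulation(768, 256)(torch.randn(2, 256, 32, 32),
+ #                                    torch.randn(2, 197, 768))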
+
+
+
+
+ + ☆ RODEO: Robust Outlier Detection via Exposing Adaptive + Out-of-Distribution Samples ICML + + +
+ In recent years, there have been significant improvements in various forms of +image outlier detection. However, outlier detection performance under +adversarial settings lags far behind that in standard settings. This is due to +the lack of effective exposure to adversarial scenarios during training, +especially on unseen outliers, leading to detection models failing to learn +robust features. To bridge this gap, we introduce RODEO, a data-centric +approach that generates effective outliers for robust outlier detection. More +specifically, we show that incorporating outlier exposure (OE) and adversarial +training can be an effective strategy for this purpose, as long as the exposed +training outliers meet certain characteristics, including diversity, and both +conceptual differentiability and analogy to the inlier samples. We leverage a +text-to-image model to achieve this goal. We demonstrate both quantitatively +and qualitatively that our adaptive OE method effectively generates ``diverse'' +and ``near-distribution'' outliers, leveraging information from both text and +image domains. Moreover, our experimental results show that utilizing our +synthesized outliers significantly enhances the performance of the outlier +detector, particularly in adversarial settings. + +
+
+ comment: Accepted at the Forty-First International Conference on Machine + Learning (ICML) 2024. The implementation of our work is available at: + \url{https://github.com/rohban-lab/RODEO} +
+
+
+
+
+ + ☆ What Really Matters for Learning-based LiDAR-Camera Calibration + + +
+ Calibration is an essential prerequisite for the accurate data fusion of +LiDAR and camera sensors. Traditional calibration techniques often require +specific targets or suitable scenes to obtain reliable 2D-3D correspondences. +To tackle the challenge of target-less and online calibration, deep neural +networks have been introduced to solve the problem in a data-driven manner. +While previous learning-based methods have achieved impressive performance on +specific datasets, they still struggle in complex real-world scenarios. Most +existing works focus on improving calibration accuracy but overlook the +underlying mechanisms. In this paper, we revisit the development of +learning-based LiDAR-Camera calibration and encourage the community to pay more +attention to the underlying principles to advance practical applications. We +systematically analyze the paradigm of mainstream learning-based methods, and +identify the critical limitations of regression-based methods with the widely +used data generation pipeline. Our findings reveal that most learning-based +methods inadvertently operate as retrieval networks, focusing more on +single-modality distributions rather than cross-modality correspondences. We +also investigate how the input data format and preprocessing operations impact +network performance and summarize the regression clues to inform further +improvements. + +
+
+
+
+
+ + ☆ Image-based Geo-localization for Robotics: Are Black-box Vision-Language + Models there yet? IROS 2025 + + +
+ The advances in Vision-Language models (VLMs) offer exciting opportunities +for robotic applications involving image geo-localization, the problem of +identifying the geo-coordinates of a place based on visual data only. Recent +research works have focused on using a VLM as embeddings extractor for +geo-localization, however, the most sophisticated VLMs may only be available as +black boxes that are accessible through an API, and come with a number of +limitations: there is no access to training data, model features and gradients; +retraining is not possible; the number of predictions may be limited by the +API; training on model outputs is often prohibited; and queries are open-ended. +The utilization of a VLM as a stand-alone, zero-shot geo-localization system +using a single text-based prompt is largely unexplored. To bridge this gap, +this paper undertakes the first systematic study, to the best of our knowledge, +to investigate the potential of some of the state-of-the-art VLMs as +stand-alone, zero-shot geo-localization systems in a black-box setting with +realistic constraints. We consider three main scenarios for this thorough +investigation: a) fixed text-based prompt; b) semantically-equivalent +text-based prompts; and c) semantically-equivalent query images. We also take +into account the auto-regressive and probabilistic generation process of the +VLMs when investigating their utility for geo-localization task by using model +consistency as a metric in addition to traditional accuracy. Our work provides +new insights in the capabilities of different VLMs for the above-mentioned +scenarios. + +
+
+ comment: Submitted to IROS 2025 +
+
+
+
+
+ + ☆ B-FPGM: Lightweight Face Detection via Bayesian-Optimized Soft FPGM + Pruning + + +
+ Face detection is a computer vision application that increasingly demands +lightweight models to facilitate deployment on devices with limited +computational resources. Neural network pruning is a promising technique that +can effectively reduce network size without significantly affecting +performance. In this work, we propose a novel face detection pruning pipeline +that leverages Filter Pruning via Geometric Median (FPGM) pruning, Soft Filter +Pruning (SFP) and Bayesian optimization in order to achieve a superior +trade-off between size and performance compared to existing approaches. FPGM +pruning is a structured pruning technique that allows pruning the least +significant filters in each layer, while SFP iteratively prunes the filters and +allows them to be updated in any subsequent training step. Bayesian +optimization is employed in order to optimize the pruning rates of each layer, +rather than relying on engineering expertise to determine the optimal pruning +rates for each layer. In our experiments across all three subsets of the WIDER +FACE dataset, our proposed approach B-FPGM consistently outperforms existing +ones in balancing model size and performance. All our experiments were applied +to EResFD, the currently smallest (in number of parameters) well-performing +face detector of the literature; a small ablation study with a second small +face detector, EXTD, is also reported. The source code and trained pruned face +detection models can be found at: https://github.com/IDTITI/B-FPGM. + +
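+
+ For readers unfamiliar with the FPGM criterion and soft pruning mentioned
+ above, the sketch below shows one soft-pruning step on a single convolutional
+ layer. The fixed pruning rate is only an illustrative assumption; B-FPGM itself
+ tunes per-layer rates with Bayesian optimization rather than fixing them by
+ hand.
+
+ # Sketch of one Soft Filter Pruning step using the FPGM (geometric-median)
+ # criterion: filters closest to all other filters are the most redundant,
+ # so their weights are zeroed but kept trainable for later updates.
+ import torch
+ import torch.nn as nn
+
+ def fpgm_soft_prune(conv: nn.Conv2d, prune_rate: float = 0.3) -> None:
+     w = conv.weight.data                       # (out_channels, in_channels, k, k)
+     flat = w.view(w.size(0), -1)               # one row per filter
+     dists = torch.cdist(flat, flat, p=2)       # pairwise L2 distances between filters
+     redundancy = dists.sum(dim=1)              # small sum => close to the geometric median
+     n_prune = int(prune_rate * w.size(0))
+     idx = torch.argsort(redundancy)[:n_prune]  # most redundant filters
+     w[idx] = 0.0                               # soft pruning: zeroed now, may recover later
+
+ # Usage sketch with an assumed per-layer rate.
+ layer = nn.Conv2d(32, 64, kernel_size=3)
+ fpgm_soft_prune(layer, prune_rate=0.3)
+ zeroed = (layer.weight.detach().view(64, -1).abs().sum(dim=1) == 0).sum()
+ print("zeroed filters:", int(zeroed))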
+
+ comment: Accepted for publication, RWS Workshop @ IEEE/CVF Winter Conference + on Applications of Computer Vision (WACV 2025), Tucson, AZ, USA, Feb. 2025. + This is the authors' "accepted version" +
+
+
+
+
+ + ☆ Adversarial Masked Autoencoder Purifier with Defense Transferability + + +
+ The study of adversarial defense still struggles to combat advanced
+ adversarial attacks. In contrast to most prior studies that rely on diffusion
+ models for test-time defense, which markedly increases inference time, we
+ propose the Masked AutoEncoder Purifier (MAEP), which integrates a Masked
+ AutoEncoder (MAE) into an adversarial purifier framework for test-time
+ purification. While MAEP achieves promising adversarial robustness, it notably
+ features model defense transferability and attack generalization without
+ relying on additional data different from the training dataset. To our
+ knowledge, MAEP is the first study of an adversarial purifier based on MAE.
+ Extensive experimental results demonstrate that our method not only maintains
+ clean accuracy with only a slight drop but also exhibits a small gap between
+ clean and robust accuracy. Notably, MAEP trained on CIFAR10 achieves
+ state-of-the-art performance even when tested directly on ImageNet,
+ outperforming existing diffusion-based models trained specifically on ImageNet.
+
+
+
+
+ + ☆ Frequency Matters: Explaining Biases of Face Recognition in the + Frequency Domain ECCV 2024 + + +
+ Face recognition (FR) models are vulnerable to performance variations across +demographic groups. The causes for these performance differences are unclear +due to the highly complex deep learning-based structure of face recognition +models. Several works aimed at exploring possible roots of gender and ethnicity +bias, identifying semantic reasons such as hairstyle, make-up, or facial hair +as possible sources. Motivated by recent discoveries of the importance of +frequency patterns in convolutional neural networks, we explain bias in face +recognition using state-of-the-art frequency-based explanations. Our extensive +results show that different frequencies are important to FR models depending on +the ethnicity of the samples. + +
+
+ comment: Accepted at xAI4Biometrics at ECCV 2024 +
+
+
+
+
+ + ☆ Extending Information Bottleneck Attribution to Video Sequences + + +
+ We introduce VIBA, a novel approach for explainable video classification by +adapting Information Bottlenecks for Attribution (IBA) to video sequences. +While most traditional explainability methods are designed for image models, +our IBA framework addresses the need for explainability in temporal models used +for video analysis. To demonstrate its effectiveness, we apply VIBA to video +deepfake detection, testing it on two architectures: the Xception model for +spatial features and a VGG11-based model for capturing motion dynamics through +optical flow. Using a custom dataset that reflects recent deepfake generation +techniques, we adapt IBA to create relevance and optical flow maps, visually +highlighting manipulated regions and motion inconsistencies. Our results show +that VIBA generates temporally and spatially consistent explanations, which +align closely with human annotations, thus providing interpretability for video +classification and particularly for deepfake detection. + +
+
+
+
+
+ + ☆ Ultra-high resolution multimodal MRI dense labelled holistic brain atlas + + +
+ In this paper, we introduce holiAtlas, a holistic, multimodal and +high-resolution human brain atlas. This atlas covers different levels of +details of the human brain anatomy, from the organ to the substructure level, +using a new dense labelled protocol generated from the fusion of multiple local +protocols at different scales. This atlas has been constructed averaging images +and segmentations of 75 healthy subjects from the Human Connectome Project +database. Specifically, MR images of T1, T2 and WMn (White Matter nulled) +contrasts at 0.125 $mm^{3}$ resolution that were nonlinearly registered and +averaged using symmetric group-wise normalisation to construct the atlas. At +the finest level, the holiAtlas protocol has 350 different labels derived from +10 different delineation protocols. These labels were grouped at different +scales to provide a holistic view of the brain at different levels in a +coherent and consistent manner. This multiscale and multimodal atlas can be +used for the development of new ultra-high resolution segmentation methods that +can potentially leverage the early detection of neurological disorders. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ Experimenting with Affective Computing Models in Video Interviews with + Spanish-speaking Older Adults + + +
+ Understanding emotional signals in older adults is crucial for designing +virtual assistants that support their well-being. However, existing affective +computing models often face significant limitations: (1) limited availability +of datasets representing older adults, especially in non-English-speaking +populations, and (2) poor generalization of models trained on younger or +homogeneous demographics. To address these gaps, this study evaluates +state-of-the-art affective computing models -- including facial expression +recognition, text sentiment analysis, and smile detection -- using videos of +older adults interacting with either a person or a virtual avatar. As part of +this effort, we introduce a novel dataset featuring Spanish-speaking older +adults engaged in human-to-human video interviews. Through three comprehensive +analyses, we investigate (1) the alignment between human-annotated labels and +automatic model outputs, (2) the relationships between model outputs across +different modalities, and (3) individual variations in emotional signals. Using +both the Wizard of Oz (WoZ) dataset and our newly collected dataset, we uncover +limited agreement between human annotations and model predictions, weak +consistency across modalities, and significant variability among individuals. +These findings highlight the shortcomings of generalized emotion perception +models and emphasize the need of incorporating personal variability and +cultural nuances into future systems. + +
+
+
+
+
+ + ☆ Not Every Patch is Needed: Towards a More Efficient and Effective + Backbone for Video-based Person Re-identification + + +
+ This paper proposes a new effective and efficient plug-and-play backbone for
+ video-based person re-identification (ReID). Conventional video-based ReID
+ methods typically use CNN or transformer backbones to extract deep features for
+ every position in every sampled video frame. Here, we argue that this
+ exhaustive feature extraction could be unnecessary, since we find that
+ different frames in a ReID video often exhibit small differences and contain
+ many similar regions due to the relatively slight movements of human beings.
+ Inspired by this, a more selective, efficient paradigm is explored in this
+ paper. Specifically, we introduce a patch selection mechanism to reduce
+ computational cost by choosing only the crucial and non-repetitive patches for
+ feature extraction. Additionally, we present a novel network structure that
+ generates and utilizes pseudo frame global context to address the issue of
+ incomplete views resulting from sparse inputs. By incorporating these new
+ designs, our backbone can achieve both high performance and low computational
+ cost. Extensive experiments on multiple datasets show that our approach reduces
+ the computational cost by 74% compared to ViT-B and 28% compared to ResNet50,
+ while the accuracy is on par with ViT-B and outperforms ResNet50 significantly.
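+
+ One simple way to realize the patch selection idea above is to keep only
+ patches that differ sufficiently from the co-located patch in the previous
+ frame. The cosine-similarity threshold and patching scheme below are
+ hypothetical criteria for illustration, not the paper's exact mechanism.
+
+ # Sketch: select "crucial, non-repetitive" patches per frame by comparing
+ # each patch with the co-located patch of the previous frame and dropping
+ # near-duplicates.
+ import torch
+ import torch.nn.functional as F
+
+ def select_patches(frames, patch=16, sim_threshold=0.95):
+     # frames: (T, C, H, W) -> patch tokens: (T, N, C * patch * patch)
+     T = frames.size(0)
+     tokens = F.unfold(frames, kernel_size=patch, stride=patch).transpose(1, 2)
+     keep = [torch.ones(tokens.size(1), dtype=torch.bool)]   # keep all patches of frame 0
+     for t in range(1, T):
+         sim = F.cosine_similarity(tokens[t], tokens[t - 1], dim=-1)
+         keep.append(sim < sim_threshold)                    # drop patches that barely changed
+     return tokens, torch.stack(keep)                        # boolean mask of patches to process
+
+ # frames = torch.randn(8, 3, 256, 128)
+ # tokens, mask = select_patches(frames)
+ # print("fraction of patches kept:", mask.float().mean().item())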
+
+ comment: IEEE TIP +
+
+
+
+
+ + ☆ RG-Attn: Radian Glue Attention for Multi-modality Multi-agent + Cooperative Perception + + +
+ Cooperative perception offers an optimal solution to overcome the perception
+ limitations of single-agent systems by leveraging Vehicle-to-Everything (V2X)
+ communication for data sharing and fusion across multiple agents. However, most
+ existing approaches focus on single-modality data exchange, limiting the
+ potential of both homogeneous and heterogeneous fusion across agents. This
+ overlooks the opportunity to utilize multi-modality data per agent, restricting
+ the system's performance. In the automotive industry, manufacturers adopt
+ diverse sensor configurations, resulting in heterogeneous combinations of
+ sensor modalities across agents. To harness the potential of every possible
+ data source for optimal performance, we design a robust LiDAR and camera
+ cross-modality fusion module, Radian-Glue-Attention (RG-Attn), applicable to
+ both intra-agent and inter-agent cross-modality fusion scenarios, owing to the
+ convenient coordinate conversion by a transformation matrix and the unified
+ sampling/inversion mechanism. We also propose two different architectures,
+ named Paint-To-Puzzle (PTP) and Co-Sketching-Co-Coloring (CoS-CoCo), for
+ conducting cooperative perception. PTP aims for maximum precision and achieves
+ a smaller data packet size by limiting cross-agent fusion to a single instance,
+ but requires all participants to be equipped with LiDAR. In contrast, CoS-CoCo
+ supports agents with any configuration (LiDAR-only, camera-only, or both LiDAR
+ and camera), offering greater generalization ability. Our approach achieves
+ state-of-the-art (SOTA) performance on both real and simulated cooperative
+ perception datasets. The code will be released on GitHub in early 2025.
+
+
+
+
+ + ☆ Dynamic Hypergraph Representation for Bone Metastasis Cancer Analysis + + +
+ Bone metastasis analysis is a significant challenge in pathology and plays a +critical role in determining patient quality of life and treatment strategies. +The microenvironment and specific tissue structures are essential for +pathologists to predict the primary bone cancer origins and primary bone cancer +subtyping. By digitizing bone tissue sections into whole slide images (WSIs) +and leveraging deep learning to model slide embeddings, this analysis can be +enhanced. However, tumor metastasis involves complex multivariate interactions +with diverse bone tissue structures, which traditional WSI analysis methods +such as multiple instance learning (MIL) fail to capture. Moreover, graph +neural networks (GNNs), limited to modeling pairwise relationships, are hard to +represent high-order biological associations. To address these challenges, we +propose a dynamic hypergraph neural network (DyHG) that overcomes the edge +construction limitations of traditional graph representations by connecting +multiple nodes via hyperedges. A low-rank strategy is used to reduce the +complexity of parameters in learning hypergraph structures, while a +Gumbel-Softmax-based sampling strategy optimizes the patch distribution across +hyperedges. An MIL aggregator is then used to derive a graph-level embedding +for comprehensive WSI analysis. To evaluate the effectiveness of DyHG, we +construct two large-scale datasets for primary bone cancer origins and +subtyping classification based on real-world bone metastasis scenarios. +Extensive experiments demonstrate that DyHG significantly outperforms +state-of-the-art (SOTA) baselines, showcasing its ability to model complex +biological interactions and improve the accuracy of bone metastasis analysis. + +
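+
+ The Gumbel-Softmax-based assignment of patches to hyperedges mentioned above
+ can be sketched as follows. The low-rank factor shapes, the number of
+ hyperedges, and the temperature are assumptions used only to illustrate how a
+ differentiable incidence matrix might be sampled; this is not the DyHG
+ implementation.
+
+ # Sketch: build a soft hypergraph incidence matrix H (patches x hyperedges)
+ # from low-rank logits and sample assignments with Gumbel-Softmax so the
+ # structure stays differentiable. Hyperedge features are aggregated via H.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class DynamicHypergraph(nn.Module):
+     def __init__(self, dim: int, n_hyperedges: int = 16, rank: int = 8, tau: float = 1.0):
+         super().__init__()
+         self.u = nn.Linear(dim, rank, bias=False)               # low-rank factor on patch features
+         self.v = nn.Parameter(torch.randn(rank, n_hyperedges))  # low-rank factor on hyperedges
+         self.tau = tau
+
+     def forward(self, patches):                                 # patches: (N, dim) WSI patch embeddings
+         logits = self.u(patches) @ self.v                       # (N, n_hyperedges) assignment logits
+         H = F.gumbel_softmax(logits, tau=self.tau, hard=False, dim=-1)
+         degree = H.sum(dim=0, keepdim=True).clamp_min(1e-6)
+         hyperedge_feats = (H / degree).t() @ patches            # degree-normalized aggregation
+         return H, hyperedge_feats
+
+ # H, e = DynamicHypergraph(dim=384)(torch.randn(500, 384))      # 500 patches -> 16 hyperedges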
+
+ comment: 12 pages,11 figures +
+
+
+
+
+ + ☆ Exploring the Role of Explicit Temporal Modeling in Multimodal Large + Language Models for Video Understanding + + +
+ Applying Multimodal Large Language Models (MLLMs) to video understanding +presents significant challenges due to the need to model temporal relations +across frames. Existing approaches adopt either implicit temporal modeling, +relying solely on the LLM decoder, or explicit temporal modeling, employing +auxiliary temporal encoders. To investigate this debate between the two +paradigms, we propose the Stackable Temporal Encoder (STE). STE enables +flexible explicit temporal modeling with adjustable temporal receptive fields +and token compression ratios. Using STE, we systematically compare implicit and +explicit temporal modeling across dimensions such as overall performance, token +compression effectiveness, and temporal-specific understanding. We also explore +STE's design considerations and broader impacts as a plug-in module and in +image modalities. Our findings emphasize the critical role of explicit temporal +modeling, providing actionable insights to advance video MLLMs. + +
+
+
+
+
+ + ☆ FlexMotion: Lightweight, Physics-Aware, and Controllable Human Motion + Generation + + +
+ Lightweight, controllable, and physically plausible human motion synthesis is +crucial for animation, virtual reality, robotics, and human-computer +interaction applications. Existing methods often compromise between +computational efficiency, physical realism, or spatial controllability. We +propose FlexMotion, a novel framework that leverages a computationally +lightweight diffusion model operating in the latent space, eliminating the need +for physics simulators and enabling fast and efficient training. FlexMotion +employs a multimodal pre-trained Transformer encoder-decoder, integrating joint +locations, contact forces, joint actuations and muscle activations to ensure +the physical plausibility of the generated motions. FlexMotion also introduces +a plug-and-play module, which adds spatial controllability over a range of +motion parameters (e.g., joint locations, joint actuations, contact forces, and +muscle activations). Our framework achieves realistic motion generation with +improved efficiency and control, setting a new benchmark for human motion +synthesis. We evaluate FlexMotion on extended datasets and demonstrate its +superior performance in terms of realism, physical plausibility, and +controllability. + +
+
+
+
+
+ + ☆ Beyond-Labels: Advancing Open-Vocabulary Segmentation With + Vision-Language Models + + +
+ Self-supervised learning can resolve numerous image or linguistic processing
+ problems when effectively trained. This study investigated simple yet efficient
+ methods for adapting previously learned foundation models to open-vocabulary
+ semantic segmentation tasks. Our research proposed "Beyond-Labels," a
+ lightweight transformer-based fusion module that uses a small amount of image
+ segmentation data to fuse frozen image representations with language concepts.
+ Furthermore, we efficiently captured positional information in images using
+ Fourier embeddings, thus improving generalization across various image sizes.
+ Extensive ablation tests were performed to investigate the important components
+ of our proposed method; when tested against the common benchmark PASCAL-5i, it
+ demonstrated superior performance despite being trained on frozen image and
+ language features.
+
+
+
+
+ + ☆ Target-driven Self-Distillation for Partial Observed Trajectories + Forecasting + + +
+ Accurate prediction of future trajectories of traffic agents is essential for +ensuring safe autonomous driving. However, partially observed trajectories can +significantly degrade the performance of even state-of-the-art models. Previous +approaches often rely on knowledge distillation to transfer features from fully +observed trajectories to partially observed ones. This involves firstly +training a fully observed model and then using a distillation process to create +the final model. While effective, they require multi-stage training, making the +training process very expensive. Moreover, knowledge distillation can lead to a +performance degradation of the model. In this paper, we introduce a +Target-driven Self-Distillation method (TSD) for motion forecasting. Our method +leverages predicted accurate targets to guide the model in making predictions +under partial observation conditions. By employing self-distillation, the model +learns from the feature distributions of both fully observed and partially +observed trajectories during a single end-to-end training process. This +enhances the model's ability to predict motion accurately in both fully +observed and partially observed scenarios. We evaluate our method on multiple +datasets and state-of-the-art motion forecasting models. Extensive experimental +results demonstrate that our approach achieves significant performance +improvements in both settings. To facilitate further research, we will release +our code and model checkpoints. + +
+
+
+
+
+ + ☆ DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian + Splat Generation ICLR 2025 + + +
+ Recent advancements in 3D content generation from text or a single image +struggle with limited high-quality 3D datasets and inconsistency from 2D +multi-view generation. We introduce DiffSplat, a novel 3D generative framework +that natively generates 3D Gaussian splats by taming large-scale text-to-image +diffusion models. It differs from previous 3D generative models by effectively +utilizing web-scale 2D priors while maintaining 3D consistency in a unified +model. To bootstrap the training, a lightweight reconstruction model is +proposed to instantly produce multi-view Gaussian splat grids for scalable +dataset curation. In conjunction with the regular diffusion loss on these +grids, a 3D rendering loss is introduced to facilitate 3D coherence across +arbitrary views. The compatibility with image diffusion models enables seamless +adaptions of numerous techniques for image generation to the 3D realm. +Extensive experiments reveal the superiority of DiffSplat in text- and +image-conditioned generation tasks and downstream applications. Thorough +ablation studies validate the efficacy of each critical design choice and +provide insights into the underlying mechanism. + +
+
+ comment: Accepted to ICLR 2025; Project page: + https://chenguolin.github.io/projects/DiffSplat +
+
+
+
+
+ + ☆ AdaSemSeg: An Adaptive Few-shot Semantic Segmentation of Seismic Facies + + +
+ Automated interpretation of seismic images using deep learning methods is +challenging because of the limited availability of training data. Few-shot +learning is a suitable learning paradigm in such scenarios due to its ability +to adapt to a new task with limited supervision (small training budget). +Existing few-shot semantic segmentation (FSSS) methods fix the number of target +classes. Therefore, they do not support joint training on multiple datasets +varying in the number of classes. In the context of the interpretation of +seismic facies, fixing the number of target classes inhibits the generalization +capability of a model trained on one facies dataset to another, which is likely +to have a different number of facies. To address this shortcoming, we propose a +few-shot semantic segmentation method for interpreting seismic facies that can +adapt to the varying number of facies across the dataset, dubbed the AdaSemSeg. +In general, the backbone network of FSSS methods is initialized with the +statistics learned from the ImageNet dataset for better performance. The lack +of such a huge annotated dataset for seismic images motivates using a +self-supervised algorithm on seismic datasets to initialize the backbone +network. We have trained the AdaSemSeg on three public seismic facies datasets +with different numbers of facies and evaluated the proposed method on multiple +metrics. The performance of the AdaSemSeg on unseen datasets (not used in +training) is better than the prototype-based few-shot method and baselines. + +
+
+ comment: Under review at IEEE Transactions on Geoscience and Remote Sensing +
+
+
+
+
+ + ☆ ITVTON:Virtual Try-On Diffusion Transformer Model Based on Integrated + Image and Text + + +
+ Recent advancements in virtual fitting for characters and clothing have +leveraged diffusion models to improve the realism of garment fitting. However, +challenges remain in handling complex scenes and poses, which can result in +unnatural garment fitting and poorly rendered intricate patterns. In this work, +we introduce ITVTON, a novel method that enhances clothing-character +interactions by combining clothing and character images along spatial channels +as inputs, thereby improving fitting accuracy for the inpainting model. +Additionally, we incorporate integrated textual descriptions from multiple +images to boost the realism of the generated visual effects. To optimize +computational efficiency, we limit training to the attention parameters within +a single diffusion transformer (Single-DiT) block. To more rigorously address +the complexities of real-world scenarios, we curated training samples from the +IGPair dataset, thereby enhancing ITVTON's performance across diverse +environments. Extensive experiments demonstrate that ITVTON outperforms +baseline methods both qualitatively and quantitatively, setting a new standard +for virtual fitting tasks. + +
+
+
+
+
+ + ☆ SSF-PAN: Semantic Scene Flow-Based Perception for Autonomous Navigation + in Traffic Scenarios + + +
+ Vehicle detection and localization in complex traffic scenarios pose +significant challenges due to the interference of moving objects. Traditional +methods often rely on outlier exclusions or semantic segmentations, which +suffer from low computational efficiency and accuracy. The proposed SSF-PAN can +achieve the functionalities of LiDAR point cloud based object +detection/localization and SLAM (Simultaneous Localization and Mapping) with +high computational efficiency and accuracy, enabling map-free navigation +frameworks. The novelty of this work is threefold: 1) developing a neural +network which can achieve segmentation among static and dynamic objects within +the scene flows with different motion features, that is, semantic scene flow +(SSF); 2) developing an iterative framework which can further optimize the +quality of input scene flows and output segmentation results; 3) developing a +scene flow-based navigation platform which can test the performance of the SSF +perception system in the simulation environment. The proposed SSF-PAN method is +validated using the SUScape-CARLA and the KITTI datasets, as well as on the +CARLA simulator. Experimental results demonstrate that the proposed approach +outperforms traditional methods in terms of scene flow computation accuracy, +moving object detection accuracy, computational efficiency, and autonomous +navigation effectiveness. + +
+
+
+
+
+ + ☆ Overcoming Semantic Dilution in Transformer-Based Next Frame Prediction + + +
+ Next-frame prediction in videos is crucial for applications such as
+ autonomous driving, object tracking, and motion prediction. The primary
+ challenge in next-frame prediction lies in effectively capturing and processing
+ both spatial and temporal information from previous video sequences. The
+ transformer architecture, known for its prowess in handling sequence data, has
+ made remarkable progress in this domain. However, transformer-based next-frame
+ prediction models face notable issues: (a) The multi-head self-attention (MHSA)
+ mechanism requires the input embedding to be split into $N$ chunks, where $N$
+ is the number of heads. Each segment captures only a fraction of the original
+ embedding's information, which distorts the representation of the embedding in
+ the latent space, resulting in a semantic dilution problem; (b) These models
+ predict the embeddings of the next frames rather than the frames themselves,
+ but the loss function is based on the errors of the reconstructed frames, not
+ the predicted embeddings, which creates a discrepancy between the training
+ objective and the model output. We propose a Semantic Concentration Multi-Head
+ Self-Attention (SCMHSA) architecture, which effectively mitigates semantic
+ dilution in transformer-based next-frame prediction. Additionally, we introduce
+ a loss function that optimizes SCMHSA in the latent space, aligning the
+ training objective more closely with the model output. Our method demonstrates
+ superior performance compared to the original transformer-based predictors.
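+
+ To make the head-splitting issue above concrete, the sketch below shows a
+ "semantic concentration" variant of multi-head attention in which every head
+ projects from the full embedding instead of a 1/N slice of it. This is an
+ illustrative reading of the idea, not the authors' exact SCMHSA layer; the
+ averaging used to recombine head outputs is an assumption.
+
+ # Sketch: each head receives a projection of the FULL embedding, so no head
+ # works from a diluted fragment. Head outputs are averaged back to d_model.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class SemanticConcentrationMHSA(nn.Module):
+     def __init__(self, d_model: int, n_heads: int):
+         super().__init__()
+         self.heads = nn.ModuleList([
+             nn.ModuleDict({k: nn.Linear(d_model, d_model) for k in ("q", "k", "v")})
+             for _ in range(n_heads)
+         ])
+         self.out = nn.Linear(d_model, d_model)
+
+     def forward(self, x):                       # x: (batch, seq, d_model)
+         per_head = []
+         for h in self.heads:
+             q, k, v = h["q"](x), h["k"](x), h["v"](x)     # full-width projections
+             attn = F.softmax(q @ k.transpose(-2, -1) / q.size(-1) ** 0.5, dim=-1)
+             per_head.append(attn @ v)
+         return self.out(torch.stack(per_head).mean(dim=0))
+
+ # x = torch.randn(2, 8, 64); y = SemanticConcentrationMHSA(64, 4)(x)  # -> (2, 8, 64)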
+
+
+
+
+ + ☆ DebugAgent: Efficient and Interpretable Error Slice Discovery for + Comprehensive Model Debugging + + +
+ Despite the significant success of deep learning models in computer vision, +they often exhibit systematic failures on specific data subsets, known as error +slices. Identifying and mitigating these error slices is crucial to enhancing +model robustness and reliability in real-world scenarios. In this paper, we +introduce DebugAgent, an automated framework for error slice discovery and +model repair. DebugAgent first generates task-specific visual attributes to +highlight instances prone to errors through an interpretable and structured +process. It then employs an efficient slice enumeration algorithm to +systematically identify error slices, overcoming the combinatorial challenges +that arise during slice exploration. Additionally, DebugAgent extends its +capabilities by predicting error slices beyond the validation set, addressing a +key limitation of prior approaches. Extensive experiments across multiple +domains, including image classification, pose estimation, and object detection +- show that DebugAgent not only improves the coherence and precision of +identified error slices but also significantly enhances the model repair +capabilities. + +
+
+
+
+
+ + ☆ Efficient Knowledge Distillation of SAM for Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) has set a new standard in interactive image +segmentation, offering robust performance across various tasks. However, its +significant computational requirements limit its deployment in real-time or +resource-constrained environments. To address these challenges, we propose a +novel knowledge distillation approach, KD SAM, which incorporates both encoder +and decoder optimization through a combination of Mean Squared Error (MSE) and +Perceptual Loss. This dual-loss framework captures structural and semantic +features, enabling the student model to maintain high segmentation accuracy +while reducing computational complexity. Based on the model evaluation on +datasets, including Kvasir-SEG, ISIC 2017, Fetal Head Ultrasound, and Breast +Ultrasound, we demonstrate that KD SAM achieves comparable or superior +performance to the baseline models, with significantly fewer parameters. KD SAM +effectively balances segmentation accuracy and computational efficiency, making +it well-suited for real-time medical image segmentation applications in +resource-constrained environments. + +
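+
+ The dual-loss idea described above can be summarized with a short sketch. The
+ frozen VGG16 features used for the perceptual term, the loss weighting alpha,
+ and the omitted ImageNet normalization are assumptions rather than the exact
+ KD SAM recipe.
+
+ # Sketch of a combined distillation objective: MSE between teacher and student
+ # encoder features plus a perceptual loss on predicted masks computed with a
+ # frozen VGG16 feature extractor.
+ import torch
+ import torch.nn as nn
+ import torchvision
+
+ class DistillationLoss(nn.Module):
+     def __init__(self, alpha: float = 0.5):
+         super().__init__()
+         vgg = torchvision.models.vgg16(weights="IMAGENET1K_V1").features[:16].eval()
+         for p in vgg.parameters():
+             p.requires_grad_(False)
+         self.vgg, self.alpha, self.mse = vgg, alpha, nn.MSELoss()
+
+     def forward(self, student_feat, teacher_feat, student_mask, teacher_mask):
+         structural = self.mse(student_feat, teacher_feat)   # encoder-level MSE term
+         # Perceptual term: compare VGG features of the predicted masks
+         # (single-channel masks tiled to 3 channels to fit VGG's input).
+         perceptual = self.mse(self.vgg(student_mask.repeat(1, 3, 1, 1)),
+                               self.vgg(teacher_mask.repeat(1, 3, 1, 1)))
+         return self.alpha * structural + (1 - self.alpha) * perceptual
+
+ # loss = DistillationLoss()(s_feat, t_feat.detach(), s_mask, t_mask.detach())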
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Consistency Diffusion Models for Single-Image 3D Reconstruction with + Priors + + +
+ This paper delves into the study of 3D point cloud reconstruction from a +single image. Our objective is to develop the Consistency Diffusion Model, +exploring synergistic 2D and 3D priors in the Bayesian framework to ensure +superior consistency in the reconstruction process, a challenging yet critical +requirement in this field. Specifically, we introduce a pioneering training +framework under diffusion models that brings two key innovations. First, we +convert 3D structural priors derived from the initial 3D point cloud as a bound +term to increase evidence in the variational Bayesian framework, leveraging +these robust intrinsic priors to tightly govern the diffusion training process +and bolster consistency in reconstruction. Second, we extract and incorporate +2D priors from the single input image, projecting them onto the 3D point cloud +to enrich the guidance for diffusion training. Our framework not only sidesteps +potential model learning shifts that may arise from directly imposing +additional constraints during training but also precisely transposes the 2D +priors into the 3D domain. Extensive experimental evaluations reveal that our +approach sets new benchmarks in both synthetic and real-world datasets. The +code is included with the submission. + +
+
+
+
+
+ + ☆ Dream to Drive with Predictive Individual World Model + + +
+ It is still a challenging topic to make reactive driving behaviors in complex +urban environments as road users' intentions are unknown. Model-based +reinforcement learning (MBRL) offers great potential to learn a reactive policy +by constructing a world model that can provide informative states and +imagination training. However, a critical limitation in relevant research lies +in the scene-level reconstruction representation learning, which may overlook +key interactive vehicles and hardly model the interactive features among +vehicles and their long-term intentions. Therefore, this paper presents a novel +MBRL method with a predictive individual world model (PIWM) for autonomous +driving. PIWM describes the driving environment from an individual-level +perspective and captures vehicles' interactive relations and their intentions +via trajectory prediction task. Meanwhile, a behavior policy is learned jointly +with PIWM. It is trained in PIWM's imagination and effectively navigates in the +urban driving scenes leveraging intention-aware latent states. The proposed +method is trained and evaluated on simulation environments built upon +real-world challenging interactive scenarios. Compared with popular model-free +and state-of-the-art model-based reinforcement learning methods, experimental +results show that the proposed method achieves the best performance in terms of +safety and efficiency. + +
+
+ comment: Codes: https://github.com/gaoyinfeng/PIWM +
+
+
+
+
+ + ☆ B-RIGHT: Benchmark Re-evaluation for Integrity in Generalized + Human-Object Interaction Testing + + +
+ Human-object interaction (HOI) is an essential problem in artificial
+ intelligence (AI) which aims to understand the visual world that involves
+ complex relationships between humans and objects. However, current benchmarks
+ such as HICO-DET face the following limitations: (1) severe class imbalance and
+ (2) varying train and test set sizes for certain classes. These issues can
+ potentially lead to either inflation or deflation of model performance during
+ evaluation, ultimately undermining the reliability of evaluation scores. In
+ this paper, we propose a systematic approach to develop a new class-balanced
+ dataset, Benchmark Re-evaluation for Integrity in Generalized Human-object
+ Interaction Testing (B-RIGHT), that addresses these imbalance problems.
+ B-RIGHT achieves class balance by leveraging a balancing algorithm and
+ automated generation-and-filtering processes, ensuring an equal number of
+ instances for each HOI class. Furthermore, we design a balanced zero-shot test
+ set to systematically evaluate models on unseen scenarios. Re-evaluating
+ existing models using B-RIGHT reveals a substantial reduction in score variance
+ and changes in performance rankings compared to conventional HICO-DET. Our
+ experiments demonstrate that evaluation under balanced conditions ensures more
+ reliable and fair model comparisons.
+
+
+
+
+ + ☆ One Head Eight Arms: Block Matrix based Low Rank Adaptation for + CLIP-based Few-Shot Learning + + +
+ Recent advancements in fine-tuning Vision-Language Foundation Models (VLMs)
+ have garnered significant attention for their effectiveness in downstream
+ few-shot learning tasks. While these recent approaches exhibit some performance
+ improvements, they often suffer from excessive training parameters and high
+ computational costs. To address these challenges, we propose a novel Block
+ matrix-based low-rank adaptation framework, called Block-LoRA, for fine-tuning
+ VLMs on downstream few-shot tasks. Inspired by recent work on Low-Rank
+ Adaptation (LoRA), Block-LoRA partitions the original low-rank decomposition
+ matrix of LoRA into a series of sub-matrices while sharing all down-projection
+ sub-matrices. This structure not only reduces the number of training
+ parameters, but also transforms certain complex matrix multiplication
+ operations into simpler matrix additions, significantly lowering the
+ computational cost of fine-tuning. Notably, Block-LoRA enables fine-tuning CLIP
+ on the ImageNet few-shot benchmark using a single 24GB GPU. We also show that
+ Block-LoRA has a tighter generalization error bound than vanilla LoRA. Without
+ bells and whistles, extensive experiments demonstrate that Block-LoRA achieves
+ competitive performance compared to state-of-the-art CLIP-based few-shot
+ methods, while maintaining a low training parameter count and reduced
+ computational overhead.
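+
+ A minimal sketch of the shared-down-projection idea is given below. The block
+ count, rank, scaling, and the way block outputs are combined are assumptions
+ meant only to illustrate why the shared projection is computed once and the
+ per-block work reduces to additions; this is not the authors' Block-LoRA code.
+
+ # Sketch of a Block-LoRA-style linear layer: one shared down-projection and
+ # several up-projection blocks whose outputs are summed.
+ import torch
+ import torch.nn as nn
+
+ class BlockLoRALinear(nn.Module):
+     def __init__(self, base: nn.Linear, rank: int = 8, n_blocks: int = 4, alpha: float = 16.0):
+         super().__init__()
+         self.base = base
+         for p in self.base.parameters():
+             p.requires_grad_(False)             # frozen pretrained weight
+         self.down = nn.Linear(base.in_features, rank, bias=False)   # shared down-projection
+         self.ups = nn.ModuleList([nn.Linear(rank, base.out_features, bias=False)
+                                   for _ in range(n_blocks)])        # up-projection blocks
+         self.scale = alpha / rank
+         nn.init.normal_(self.down.weight, std=0.01)
+         for up in self.ups:
+             nn.init.zeros_(up.weight)           # adapter starts as an identity update
+
+     def forward(self, x):
+         h = self.down(x)                        # shared low-rank projection, computed once
+         delta = sum(up(h) for up in self.ups)   # block outputs combined by addition
+         return self.base(x) + self.scale * delta
+
+ # adapted = BlockLoRALinear(nn.Linear(512, 512)); y = adapted(torch.randn(4, 512))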
+ comment: Under Review

+ ☆ Point Cloud Upsampling as Statistical Shape Model for Pelvic
+ We propose a novel framework that integrates medical image segmentation and point cloud upsampling for accurate shape reconstruction of pelvic models. Using the SAM-Med3D model for segmentation and a point cloud upsampling network trained on the MedShapeNet dataset, our method transforms sparse medical imaging data into high-resolution 3D bone models. This framework leverages prior knowledge of anatomical shapes, achieving smoother and more complete reconstructions. Quantitative evaluations using metrics such as the Chamfer Distance demonstrate the effectiveness of point cloud upsampling for pelvic models. Our approach offers potential applications in reconstructing other skeletal structures, providing a robust solution for medical image analysis and statistical shape modeling.
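For reference, the Chamfer Distance used as an evaluation metric above can be computed for two point clouds roughly as follows (a plain NumPy sketch, not the authors' evaluation code):

import numpy as np

def chamfer_distance(p: np.ndarray, q: np.ndarray) -> float:
    """Symmetric Chamfer distance between point sets p (N, 3) and q (M, 3).

    For each point, take the squared distance to its nearest neighbour in the
    other set, then average both directions.
    """
    # Pairwise squared distances, shape (N, M).
    d2 = ((p[:, None, :] - q[None, :, :]) ** 2).sum(axis=-1)
    return float(d2.min(axis=1).mean() + d2.min(axis=0).mean())

# Usage with random stand-in clouds:
# cd = chamfer_distance(np.random.rand(2048, 3), np.random.rand(4096, 3))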
+ comment: 10 pages, 2 figures

+ ☆ Separate Motion from Appearance: Customizing Motion via Customizing Text-to-Video Diffusion Models
+ Motion customization aims to adapt the diffusion model (DM) to generate +videos with the motion specified by a set of video clips with the same motion +concept. To realize this goal, the adaptation of DM should be possible to model +the specified motion concept, without compromising the ability to generate +diverse appearances. Thus, the key to solving this problem lies in how to +separate the motion concept from the appearance in the adaptation process of +DM. Typical previous works explore different ways to represent and insert a +motion concept into large-scale pretrained text-to-video diffusion models, +e.g., learning a motion LoRA, using latent noise residuals, etc. While those +methods can encode the motion concept, they also inevitably encode the +appearance in the reference videos, resulting in weakened appearance generation +capability. In this paper, we follow the typical way to learn a motion LoRA to +encode the motion concept, but propose two novel strategies to enhance +motion-appearance separation, including temporal attention purification (TAP) +and appearance highway (AH). Specifically, we assume that in the temporal +attention module, the pretrained Value embeddings are sufficient to serve as +basic components needed by producing a new motion. Thus, in TAP, we choose only +to reshape the temporal attention with motion LoRAs so that Value embeddings +can be reorganized to produce a new motion. Further, in AH, we alter the +starting point of each skip connection in U-Net from the output of each +temporal attention module to the output of each spatial attention module. +Extensive experiments demonstrate that compared to previous works, our method +can generate videos with appearance more aligned with the text descriptions and +motion more consistent with the reference videos. + +
+ comment: 8 pages, 6 figures

+ ☆ DFCon: Attention-Driven Supervised Contrastive Learning for Robust Deepfake Detection
+ This report presents our approach for the IEEE SP Cup 2025: Deepfake Face +Detection in the Wild (DFWild-Cup), focusing on detecting deepfakes across +diverse datasets. Our methodology employs advanced backbone models, including +MaxViT, CoAtNet, and EVA-02, fine-tuned using supervised contrastive loss to +enhance feature separation. These models were specifically chosen for their +complementary strengths. Integration of convolution layers and strided +attention in MaxViT is well-suited for detecting local features. In contrast, +hybrid use of convolution and attention mechanisms in CoAtNet effectively +captures multi-scale features. Robust pretraining with masked image modeling of +EVA-02 excels at capturing global features. After training, we freeze the +parameters of these models and train the classification heads. Finally, a +majority voting ensemble is employed to combine the predictions from these +models, improving robustness and generalization to unseen scenarios. The +proposed system addresses the challenges of detecting deepfakes in real-world +conditions and achieves a commendable accuracy of 95.83% on the validation +dataset. + +
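The final majority-voting step over the three backbones can be written in a few lines; the sketch below assumes hard binary real/fake predictions and is only illustrative of the ensembling described above:

import numpy as np

def majority_vote(predictions: np.ndarray) -> np.ndarray:
    """Combine per-model binary predictions of shape (n_models, n_samples).

    A sample is labelled fake (1) when more than half of the models say so.
    """
    votes = predictions.sum(axis=0)
    return (votes > predictions.shape[0] / 2).astype(int)

# Usage: stack hard predictions from the MaxViT, CoAtNet and EVA-02 heads
# (hypothetical arrays).
# preds = np.stack([maxvit_pred, coatnet_pred, eva02_pred])
# final = majority_vote(preds)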
+ comment: Technical report for IEEE Signal Processing Cup 2025, 7 pages

+ ☆ Determining Mosaic Resilience in Sugarcane Plants using Hyperspectral Images
+ Sugarcane mosaic disease poses a serious threat to the Australian sugarcane +industry, leading to yield losses of up to 30% in susceptible varieties. +Existing manual inspection methods for detecting mosaic resilience are +inefficient and impractical for large-scale application. This study introduces +a novel approach using hyperspectral imaging and machine learning to detect +mosaic resilience by leveraging global feature representation from local +spectral patches. Hyperspectral data were collected from eight sugarcane +varieties under controlled and field conditions. Local spectral patches were +analyzed to capture spatial and spectral variations, which were then aggregated +into global feature representations using a ResNet18 deep learning +architecture. While classical methods like Support Vector Machines struggled to +utilize spatial-spectral relationships effectively, the deep learning model +achieved high classification accuracy, demonstrating its capacity to identify +mosaic resilience from fine-grained hyperspectral data. This approach enhances +early detection capabilities, enabling more efficient management of susceptible +strains and contributing to sustainable sugarcane production. + +
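A rough sketch of the local-patch-to-global-feature idea described above (the class names, patch size, and mean-pooling choice are our assumptions, not taken from the paper):

import torch
import torch.nn as nn
from torchvision.models import resnet18

class PatchAggregator(nn.Module):
    """Encode local spectral patches with a ResNet18 trunk and pool them into one global feature."""
    def __init__(self, in_bands: int, n_classes: int):
        super().__init__()
        trunk = resnet18(weights=None)
        # Replace the RGB stem so it accepts `in_bands` spectral channels.
        trunk.conv1 = nn.Conv2d(in_bands, 64, kernel_size=7, stride=2, padding=3, bias=False)
        trunk.fc = nn.Identity()               # keep the 512-d features
        self.trunk = trunk
        self.head = nn.Linear(512, n_classes)

    def forward(self, patches):                # patches: (B, P, C, H, W)
        b, p, c, h, w = patches.shape
        feats = self.trunk(patches.reshape(b * p, c, h, w)).reshape(b, p, -1)
        global_feat = feats.mean(dim=1)        # aggregate local patches into a global representation
        return self.head(global_feat)

# Usage: 8 patches of 32x32 pixels with 200 spectral bands per plant sample (made-up sizes).
# logits = PatchAggregator(in_bands=200, n_classes=2)(torch.randn(4, 8, 200, 32, 32))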
+ ☆ 3D-MoE: A Mixture-of-Experts Multi-modal LLM for 3D Vision and Pose Diffusion via Rectified Flow
+ 3D vision and spatial reasoning have long been recognized as preferable for +accurately perceiving our three-dimensional world, especially when compared +with traditional visual reasoning based on 2D images. Due to the difficulties +in collecting high-quality 3D data, research in this area has only recently +gained momentum. With the advent of powerful large language models (LLMs), +multi-modal LLMs for 3D vision have been developed over the past few years. +However, most of these models focus primarily on the vision encoder for 3D +data. In this paper, we propose converting existing densely activated LLMs into +mixture-of-experts (MoE) models, which have proven effective for multi-modal +data processing. In addition to leveraging these models' instruction-following +capabilities, we further enable embodied task planning by attaching a diffusion +head, Pose-DiT, that employs a novel rectified flow diffusion scheduler. +Experimental results on 3D question answering and task-planning tasks +demonstrate that our 3D-MoE framework achieves improved performance with fewer +activated parameters. + +
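For readers unfamiliar with the mixture-of-experts conversion mentioned above, a generic top-k gated MoE feed-forward layer looks roughly like this (a textbook sketch, not the 3D-MoE code; expert sizes and k are made up):

import torch
import torch.nn as nn
import torch.nn.functional as F

class MoEFeedForward(nn.Module):
    """Sparsely activated feed-forward block: each token is routed to its top-k experts."""
    def __init__(self, d_model=512, d_hidden=1024, n_experts=8, k=2):
        super().__init__()
        self.experts = nn.ModuleList(
            [nn.Sequential(nn.Linear(d_model, d_hidden), nn.GELU(), nn.Linear(d_hidden, d_model))
             for _ in range(n_experts)]
        )
        self.gate = nn.Linear(d_model, n_experts)
        self.k = k

    def forward(self, x):                      # x: (tokens, d_model)
        scores = F.softmax(self.gate(x), dim=-1)
        topv, topi = scores.topk(self.k, dim=-1)
        topv = topv / topv.sum(dim=-1, keepdim=True)   # renormalise the kept gates
        out = torch.zeros_like(x)
        for slot in range(self.k):
            for e, expert in enumerate(self.experts):
                mask = topi[:, slot] == e
                if mask.any():
                    out[mask] += topv[mask, slot, None] * expert(x[mask])
        return out

# y = MoEFeedForward()(torch.randn(16, 512))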
+ comment: Preprint. Work in progress

+ ☆ SliceOcc: Indoor 3D Semantic Occupancy Prediction with Vertical Slice Representation ICRA 2025
+ 3D semantic occupancy prediction is a crucial task in visual perception, as +it requires the simultaneous comprehension of both scene geometry and +semantics. It plays a crucial role in understanding 3D scenes and has great +potential for various applications, such as robotic vision perception and +autonomous driving. Many existing works utilize planar-based representations +such as Bird's Eye View (BEV) and Tri-Perspective View (TPV). These +representations aim to simplify the complexity of 3D scenes while preserving +essential object information, thereby facilitating efficient scene +representation. However, in dense indoor environments with prevalent +occlusions, directly applying these planar-based methods often leads to +difficulties in capturing global semantic occupancy, ultimately degrading model +performance. In this paper, we present a new vertical slice representation that +divides the scene along the vertical axis and projects spatial point features +onto the nearest pair of parallel planes. To utilize these slice features, we +propose SliceOcc, an RGB camera-based model specifically tailored for indoor 3D +semantic occupancy prediction. SliceOcc utilizes pairs of slice queries and +cross-attention mechanisms to extract planar features from input images. These +local planar features are then fused to form a global scene representation, +which is employed for indoor occupancy prediction. Experimental results on the +EmbodiedScan dataset demonstrate that SliceOcc achieves a mIoU of 15.45% across +81 indoor categories, setting a new state-of-the-art performance among RGB +camera-based models for indoor 3D semantic occupancy prediction. Code is +available at https://github.com/NorthSummer/SliceOcc. + +
+ comment: Accepted by ICRA 2025

+ ☆ Polyp-Gen: Realistic and Diverse Polyp Image Generation for Endoscopic Dataset Expansion ICRA 2025
+ Automated diagnostic systems (ADS) have shown significant potential in the +early detection of polyps during endoscopic examinations, thereby reducing the +incidence of colorectal cancer. However, due to high annotation costs and +strict privacy concerns, acquiring high-quality endoscopic images poses a +considerable challenge in the development of ADS. Despite recent advancements +in generating synthetic images for dataset expansion, existing endoscopic image +generation algorithms failed to accurately generate the details of polyp +boundary regions and typically required medical priors to specify plausible +locations and shapes of polyps, which limited the realism and diversity of the +generated images. To address these limitations, we present Polyp-Gen, the first +full-automatic diffusion-based endoscopic image generation framework. +Specifically, we devise a spatial-aware diffusion training scheme with a +lesion-guided loss to enhance the structural context of polyp boundary regions. +Moreover, to capture medical priors for the localization of potential polyp +areas, we introduce a hierarchical retrieval-based sampling strategy to match +similar fine-grained spatial features. In this way, our Polyp-Gen can generate +realistic and diverse endoscopic images for building reliable ADS. Extensive +experiments demonstrate the state-of-the-art generation quality, and the +synthetic images can improve the downstream polyp detection task. Additionally, +our Polyp-Gen has shown remarkable zero-shot generalizability on other +datasets. The source code is available at +https://github.com/CUHK-AIM-Group/Polyp-Gen. + +
+ comment: Accepted by ICRA 2025

+ ☆ Improving Interpretability and Accuracy in Neuro-Symbolic Rule Extraction Using Class-Specific Sparse Filters
+ There has been significant focus on creating neuro-symbolic models for +interpretable image classification using Convolutional Neural Networks (CNNs). +These methods aim to replace the CNN with a neuro-symbolic model consisting of +the CNN, which is used as a feature extractor, and an interpretable rule-set +extracted from the CNN itself. While these approaches provide interpretability +through the extracted rule-set, they often compromise accuracy compared to the +original CNN model. In this paper, we identify the root cause of this accuracy +loss as the post-training binarization of filter activations to extract the +rule-set. To address this, we propose a novel sparsity loss function that +enables class-specific filter binarization during CNN training, thus minimizing +information loss when extracting the rule-set. We evaluate several training +strategies with our novel sparsity loss, analyzing their effectiveness and +providing guidance on their appropriate use. Notably, we set a new benchmark, +achieving a 9% improvement in accuracy and a 53% reduction in rule-set size on +average, compared to the previous SOTA, while coming within 3% of the original +CNN's accuracy. This highlights the significant potential of interpretable +neuro-symbolic models as viable alternatives to black-box CNNs. + +
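As a hedged illustration of a class-specific sparsity objective (our own simple formulation, not necessarily the loss proposed in the paper), one can penalise a filter's activation on images whose class it is not assigned to:

import torch

def class_specific_sparsity_loss(activations, labels, filter_class):
    """Encourage each filter to fire only for its assigned class.

    activations : (batch, n_filters) pooled filter responses, assumed non-negative
    labels      : (batch,) integer class labels
    filter_class: (n_filters,) class assigned to each filter
    """
    # off_class[i, j] is True when filter j is NOT assigned to sample i's class.
    off_class = labels[:, None] != filter_class[None, :]
    return (activations * off_class).mean()

# Usage inside a training step (hypothetical tensors and weight):
# loss = ce_loss + 0.1 * class_specific_sparsity_loss(pooled_acts, y, filt_cls)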
+ ☆ CSPCL: Category Semantic Prior Contrastive Learning for Deformable DETR-Based Prohibited Item Detectors
+ Prohibited item detection based on X-ray images is one of the most effective security inspection methods. However, the foreground-background feature coupling caused by the overlapping phenomenon specific to X-ray images makes general detectors designed for natural images perform poorly. To address this issue, we propose a Category Semantic Prior Contrastive Learning (CSPCL) mechanism, which aligns the class prototypes perceived by the classifier with the content queries to correct and supplement the missing semantic information responsible for classification, thereby enhancing the model's sensitivity to foreground features. To achieve this alignment, we design a specific contrastive loss, CSP loss, which includes an Intra-Class Truncated Attraction (ITA) loss and an Inter-Class Adaptive Repulsion (IAR) loss, and outperforms the classic N-pair loss and InfoNCE loss. Specifically, the ITA loss leverages class prototypes to attract intra-class category-specific content queries while preserving necessary distinctiveness. The IAR loss utilizes class prototypes to adaptively repel inter-class category-specific content queries based on the similarity between class prototypes, helping disentangle features of similar categories. CSPCL is general and can be easily integrated into Deformable DETR-based models. Extensive experiments on the PIXray and OPIXray datasets demonstrate that CSPCL significantly enhances the performance of various state-of-the-art models without increasing complexity. The code will be open-sourced once the paper is accepted.
+ comment: 10 pages

+ ☆ Improving Vision-Language-Action Model with Online Reinforcement Learning ICRA 2025
+ Recent studies have successfully integrated large vision-language models +(VLMs) into low-level robotic control by supervised fine-tuning (SFT) with +expert robotic datasets, resulting in what we term vision-language-action (VLA) +models. Although the VLA models are powerful, how to improve these large models +during interaction with environments remains an open question. In this paper, +we explore how to further improve these VLA models via Reinforcement Learning +(RL), a commonly used fine-tuning technique for large models. However, we find +that directly applying online RL to large VLA models presents significant +challenges, including training instability that severely impacts the +performance of large models, and computing burdens that exceed the capabilities +of most local machines. To address these challenges, we propose iRe-VLA +framework, which iterates between Reinforcement Learning and Supervised +Learning to effectively improve VLA models, leveraging the exploratory benefits +of RL while maintaining the stability of supervised learning. Experiments in +two simulated benchmarks and a real-world manipulation suite validate the +effectiveness of our method. + +
+ comment: Accepted to ICRA 2025

+ ☆ Vision-based autonomous structural damage detection using data-driven methods
+ This study addresses the urgent need for efficient and accurate damage +detection in wind turbine structures, a crucial component of renewable energy +infrastructure. Traditional inspection methods, such as manual assessments and +non-destructive testing (NDT), are often costly, time-consuming, and prone to +human error. To tackle these challenges, this research investigates advanced +deep learning algorithms for vision-based structural health monitoring (SHM). A +dataset of wind turbine surface images, featuring various damage types and +pollution, was prepared and augmented for enhanced model training. Three +algorithms-YOLOv7, its lightweight variant, and Faster R-CNN- were employed to +detect and classify surface damage. The models were trained and evaluated on a +dataset split into training, testing, and evaluation subsets (80%-10%-10%). +Results indicate that YOLOv7 outperformed the others, achieving 82.4% mAP@50 +and high processing speed, making it suitable for real-time inspections. By +optimizing hyperparameters like learning rate and batch size, the models' +accuracy and efficiency improved further. YOLOv7 demonstrated significant +advancements in detection precision and execution speed, especially for +real-time applications. However, challenges such as dataset limitations and +environmental variability were noted, suggesting future work on segmentation +methods and larger datasets. This research underscores the potential of +vision-based deep learning techniques to transform SHM practices by reducing +costs, enhancing safety, and improving reliability, thus contributing to the +sustainable maintenance of critical infrastructure and supporting the longevity +of wind energy systems. + +
+ comment: 14 pages, 8 figures. This study examines advanced deep learning algorithms, specifically YOLOv7, for efficient and accurate damage detection in wind turbine structures. It significantly enhances detection precision and speed for real-time inspections

+ ☆ Molecular-driven Foundation Model for Oncologic Pathology
+ Foundation models are reshaping computational pathology by enabling transfer +learning, where models pre-trained on vast datasets can be adapted for +downstream diagnostic, prognostic, and therapeutic response tasks. Despite +these advances, foundation models are still limited in their ability to encode +the entire gigapixel whole-slide images without additional training and often +lack complementary multimodal data. Here, we introduce Threads, a slide-level +foundation model capable of generating universal representations of whole-slide +images of any size. Threads was pre-trained using a multimodal learning +approach on a diverse cohort of 47,171 hematoxylin and eosin (H&E)-stained +tissue sections, paired with corresponding genomic and transcriptomic profiles +- the largest such paired dataset to be used for foundation model development +to date. This unique training paradigm enables Threads to capture the tissue's +underlying molecular composition, yielding powerful representations applicable +to a wide array of downstream tasks. In extensive benchmarking across 54 +oncology tasks, including clinical subtyping, grading, mutation prediction, +immunohistochemistry status determination, treatment response prediction, and +survival prediction, Threads outperformed all baselines while demonstrating +remarkable generalizability and label efficiency. It is particularly well +suited for predicting rare events, further emphasizing its clinical utility. We +intend to make the model publicly available for the broader community. + +
+ ☆ CHiP: Cross-modal Hierarchical Direct Preference Optimization for Multimodal LLMs ICLR 2025
+ Multimodal Large Language Models (MLLMs) still struggle with hallucinations despite their impressive capabilities. Recent studies have attempted to mitigate this by applying Direct Preference Optimization (DPO) to multimodal scenarios using preference pairs from text-based responses. However, our analysis of representation distributions reveals that multimodal DPO struggles to align image and text representations and to distinguish between hallucinated and non-hallucinated descriptions. To address these challenges, we propose Cross-modal Hierarchical Direct Preference Optimization (CHiP). We introduce a visual preference optimization module within the DPO framework, enabling MLLMs to learn from both textual and visual preferences simultaneously. Furthermore, we propose a hierarchical textual preference optimization module that allows the model to capture preferences at multiple granular levels, including the response, segment, and token levels. We evaluate CHiP through both quantitative and qualitative analyses, with results across multiple benchmarks demonstrating its effectiveness in reducing hallucinations. On the Object HalBench dataset, CHiP outperforms DPO in hallucination reduction, achieving relative improvements of 52.7% and 55.5% with the Muffin and LLaVA base models, respectively. We make all our datasets and code publicly available: https://github.com/LVUGAI/CHiP.
+ comment: Accepted by ICLR 2025

+ ☆ Predicting 3D representations for Dynamic Scenes
+ We present a novel framework for dynamic radiance field prediction given monocular video streams. Unlike previous methods that primarily focus on predicting future frames, our method goes a step further by generating explicit 3D representations of the dynamic scene. The framework builds on two core designs. First, we adopt an ego-centric unbounded triplane to explicitly represent the dynamic physical world. Second, we develop a 4D-aware transformer to aggregate features from monocular videos to update the triplane. Coupling these two designs enables us to train the proposed model with large-scale monocular videos in a self-supervised manner. Our model achieves top results in dynamic radiance field prediction on NVIDIA dynamic scenes, demonstrating its strong performance in 4D physical world modeling. Besides, our model shows superior generalizability to unseen scenarios. Notably, we find that our approach exhibits emergent capabilities for geometry and semantic learning.
+ ☆ CascadeV: An Implementation of Wurstchen Architecture for Video Generation
+ Recently, with the tremendous success of diffusion models in the field of +text-to-image (T2I) generation, increasing attention has been directed toward +their potential in text-to-video (T2V) applications. However, the computational +demands of diffusion models pose significant challenges, particularly in +generating high-resolution videos with high frame rates. In this paper, we +propose CascadeV, a cascaded latent diffusion model (LDM), that is capable of +producing state-of-the-art 2K resolution videos. Experiments demonstrate that +our cascaded model achieves a higher compression ratio, substantially reducing +the computational challenges associated with high-quality video generation. We +also implement a spatiotemporal alternating grid 3D attention mechanism, which +effectively integrates spatial and temporal information, ensuring superior +consistency across the generated video frames. Furthermore, our model can be +cascaded with existing T2V models, theoretically enabling a 4$\times$ increase +in resolution or frames per second without any fine-tuning. Our code is +available at https://github.com/bytedance/CascadeV. + +
+ ☆ Unsupervised Domain Adaptation with Dynamic Clustering and Contrastive Refinement for Gait Recognition
+ Gait recognition is an emerging identification technology that distinguishes individuals at long distances by analyzing their walking patterns. Traditional techniques rely heavily on large-scale labeled datasets, which incurs high costs and significant labeling challenges. Recently, researchers have explored unsupervised gait recognition with clustering-based unsupervised domain adaptation methods and achieved notable success. However, these methods directly use the pseudo-labels generated by clustering and neglect the pseudo-label noise caused by domain differences, which degrades model training. To mitigate these issues, we propose a novel model called GaitDCCR, which aims to reduce the influence of noisy pseudo-labels on clustering and model training. Our approach can be divided into two main stages: a clustering stage and a training stage. In the clustering stage, we propose Dynamic Cluster Parameters (DCP) and Dynamic Weight Centroids (DWC) to improve the efficiency of clustering and obtain reliable cluster centroids. In the training stage, we employ the classical teacher-student structure and propose Confidence-based Pseudo-label Refinement (CPR) and a Contrastive Teacher Module (CTM) to encourage noisy samples to converge towards clusters containing their true identities. Extensive experiments on public gait datasets demonstrate that our simple and effective method significantly enhances the performance of unsupervised gait recognition, laying the foundation for its application in the real world. The code is available at https://github.com/YanSun-github/GaitDCCR
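A rough sketch of clustering-based pseudo-labelling with a confidence filter, in the spirit of the pipeline described above (the DBSCAN parameters and the keep-nearest rule are assumptions, not the paper's DCP/CPR modules):

import numpy as np
from sklearn.cluster import DBSCAN

def pseudo_labels_with_confidence(features, eps=0.5, min_samples=4, keep_ratio=0.8):
    """Cluster gait embeddings and keep only samples close to their cluster centroid.

    Returns (labels, keep_mask); label -1 marks noise samples.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(features)
    keep = np.zeros(len(features), dtype=bool)
    for c in set(labels) - {-1}:
        idx = np.where(labels == c)[0]
        centroid = features[idx].mean(axis=0)
        dist = np.linalg.norm(features[idx] - centroid, axis=1)
        # Keep the fraction of samples nearest to the centroid as "confident".
        n_keep = max(1, int(keep_ratio * len(idx)))
        keep[idx[np.argsort(dist)[:n_keep]]] = True
    return labels, keep

# labels, keep = pseudo_labels_with_confidence(embeddings)  # embeddings: (N, D) array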
+ comment: 21 pages, 8 figures

+ ♻ ☆ LUDVIG: Learning-free Uplifting of 2D Visual features to Gaussian Splatting scenes
+ We address the problem of extending the capabilities of vision foundation +models such as DINO, SAM, and CLIP, to 3D tasks. Specifically, we introduce a +novel method to uplift 2D image features into Gaussian Splatting +representations of 3D scenes. Unlike traditional approaches that rely on +minimizing a reconstruction loss, our method employs a simpler and more +efficient feature aggregation technique, augmented by a graph diffusion +mechanism. Graph diffusion refines 3D features, such as coarse segmentation +masks, by leveraging 3D geometry and pairwise similarities induced by DINOv2. +Our approach achieves performance comparable to the state of the art on +multiple downstream tasks while delivering significant speed-ups. Notably, we +obtain competitive segmentation results using generic DINOv2 features, despite +DINOv2 not being trained on millions of annotated segmentation masks like SAM. +When applied to CLIP features, our method demonstrates strong performance in +open-vocabulary object localization tasks, highlighting the versatility of our +approach. + +
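The graph-diffusion refinement can be pictured as repeated propagation of features over a similarity graph; the sketch below is a generic dense version (the actual method operates on Gaussian Splatting scenes with DINOv2-induced similarities, and alpha and the step count here are invented):

import numpy as np

def diffuse_features(features, affinity, alpha=0.8, steps=10):
    """Refine per-node features by mixing them with their graph neighbours.

    features : (N, D) initial 3D features (e.g., coarse segmentation scores)
    affinity : (N, N) non-negative pairwise similarity matrix
    """
    # Row-normalise the affinity so each update is a weighted average.
    transition = affinity / (affinity.sum(axis=1, keepdims=True) + 1e-8)
    out = features.copy()
    for _ in range(steps):
        out = alpha * transition @ out + (1.0 - alpha) * features
    return out

# refined = diffuse_features(coarse_scores, similarity_matrix)  # hypothetical inputs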
+ comment: Project page: https://juliettemarrie.github.io/ludvig

+ ♻ ☆ NeRAF: 3D Scene Infused Neural Radiance and Acoustic Fields
+ Sound plays a major role in human perception. Along with vision, it provides +essential information for understanding our surroundings. Despite advances in +neural implicit representations, learning acoustics that align with visual +scenes remains a challenge. We propose NeRAF, a method that jointly learns +acoustic and radiance fields. NeRAF synthesizes both novel views and +spatialized room impulse responses (RIR) at new positions by conditioning the +acoustic field on 3D scene geometric and appearance priors from the radiance +field. The generated RIR can be applied to auralize any audio signal. Each +modality can be rendered independently and at spatially distinct positions, +offering greater versatility. We demonstrate that NeRAF generates high-quality +audio on SoundSpaces and RAF datasets, achieving significant performance +improvements over prior methods while being more data-efficient. Additionally, +NeRAF enhances novel view synthesis of complex scenes trained with sparse data +through cross-modal learning. NeRAF is designed as a Nerfstudio module, +providing convenient access to realistic audio-visual generation. + +
+ comment: Project Page: https://amandinebtto.github.io/NeRAF

+ ♻ ☆ Distilling foundation models for robust and efficient models in digital pathology
+ In recent years, the development of foundation models (FMs) for digital pathology has relied heavily on scaling the pre-training datasets and the model size, yielding large and powerful models. While this has improved performance on diverse downstream tasks, it has also increased computational cost and inference time. In this work, we explore the distillation of a large foundation model into a smaller one, reducing the number of parameters by several orders of magnitude. Leveraging distillation techniques, our distilled model, H0-mini, achieves nearly comparable performance to large FMs at a significantly reduced inference cost. It is evaluated on several public benchmarks, achieving 3rd place on the HEST benchmark and 5th place on the EVA benchmark. Additionally, a robustness analysis conducted on the PLISM dataset demonstrates that our distilled model is highly robust to variations in staining and scanning conditions, significantly outperforming other state-of-the-art models. This opens new perspectives for designing lightweight and robust models for digital pathology, without compromising on performance.
+ comment: Preprint

+ ♻ ☆ Mobile-Agent-E: Self-Evolving Mobile Assistant for Complex Tasks
+ Smartphones have become indispensable in modern life, yet navigating complex +tasks on mobile devices often remains frustrating. Recent advancements in large +multimodal model (LMM)-based mobile agents have demonstrated the ability to +perceive and act in mobile environments. However, current approaches face +significant limitations: they fall short in addressing real-world human needs, +struggle with reasoning-intensive and long-horizon tasks, and lack mechanisms +to learn and improve from prior experiences. To overcome these challenges, we +introduce Mobile-Agent-E, a hierarchical multi-agent framework capable of +self-evolution through past experience. By hierarchical, we mean an explicit +separation of high-level planning and low-level action execution. The framework +comprises a Manager, responsible for devising overall plans by breaking down +complex tasks into subgoals, and four subordinate agents--Perceptor, Operator, +Action Reflector, and Notetaker--which handle fine-grained visual perception, +immediate action execution, error verification, and information aggregation, +respectively. Mobile-Agent-E also features a novel self-evolution module which +maintains a persistent long-term memory comprising Tips and Shortcuts. Tips are +general guidance and lessons learned from prior tasks on how to effectively +interact with the environment. Shortcuts are reusable, executable sequences of +atomic operations tailored for specific subroutines. The inclusion of Tips and +Shortcuts facilitates continuous refinement in performance and efficiency. +Alongside this framework, we introduce Mobile-Eval-E, a new benchmark featuring +complex mobile tasks requiring long-horizon, multi-app interactions. Empirical +results show that Mobile-Agent-E achieves a 22% absolute improvement over +previous state-of-the-art approaches across three foundation model backbones. +Project page: https://x-plug.github.io/MobileAgent. + +
+ ♻ ☆ PokeFlex: A Real-World Dataset of Volumetric Deformable Objects for Robotics
+ Data-driven methods have shown great potential in solving challenging +manipulation tasks; however, their application in the domain of deformable +objects has been constrained, in part, by the lack of data. To address this +lack, we propose PokeFlex, a dataset featuring real-world multimodal data that +is paired and annotated. The modalities include 3D textured meshes, point +clouds, RGB images, and depth maps. Such data can be leveraged for several +downstream tasks, such as online 3D mesh reconstruction, and it can potentially +enable underexplored applications such as the real-world deployment of +traditional control methods based on mesh simulations. To deal with the +challenges posed by real-world 3D mesh reconstruction, we leverage a +professional volumetric capture system that allows complete 360{\deg} +reconstruction. PokeFlex consists of 18 deformable objects with varying +stiffness and shapes. Deformations are generated by dropping objects onto a +flat surface or by poking the objects with a robot arm. Interaction wrenches +and contact locations are also reported for the latter case. Using different +data modalities, we demonstrated a use case for our dataset training models +that, given the novelty of the multimodal nature of Pokeflex, constitute the +state-of-the-art in multi-object online template-based mesh reconstruction from +multimodal data, to the best of our knowledge. We refer the reader to our +website ( https://pokeflex-dataset.github.io/ ) for further demos and examples. + +
+ comment: This work has been submitted to the IEEE for possible publication

+ ♻ ☆ Acquiring Submillimeter-Accurate Multi-Task Vision Datasets for Computer-Assisted Orthopedic Surgery
+ Advances in computer vision, particularly in optical image-based 3D +reconstruction and feature matching, enable applications like marker-less +surgical navigation and digitization of surgery. However, their development is +hindered by a lack of suitable datasets with 3D ground truth. This work +explores an approach to generating realistic and accurate ex vivo datasets +tailored for 3D reconstruction and feature matching in open orthopedic surgery. +A set of posed images and an accurately registered ground truth surface mesh of +the scene are required to develop vision-based 3D reconstruction and matching +methods suitable for surgery. We propose a framework consisting of three core +steps and compare different methods for each step: 3D scanning, calibration of +viewpoints for a set of high-resolution RGB images, and an optical-based method +for scene registration. We evaluate each step of this framework on an ex vivo +scoliosis surgery using a pig spine, conducted under real operating room +conditions. A mean 3D Euclidean error of 0.35 mm is achieved with respect to +the 3D ground truth. The proposed method results in submillimeter accurate 3D +ground truths and surgical images with a spatial resolution of 0.1 mm. This +opens the door to acquiring future surgical datasets for high-precision +applications. + +
+ comment: 18 pages, 12 figures. Submitted to the 16th International Conference on Information Processing in Computer-Assisted Interventions (IPCAI 2025)

+ ♻ ☆ Uni-Renderer: Unifying Rendering and Inverse Rendering Via Dual Stream Diffusion
+ Rendering and inverse rendering are pivotal tasks in both computer vision and graphics. The rendering equation is the core of the two tasks, as an ideal conditional distribution transfer function from intrinsic properties to RGB images. Despite the promising results of existing rendering methods, they merely approximate the ideal estimation for a specific scene and come with a high computational cost. Additionally, the inverse conditional distribution transfer is intractable due to inherent ambiguity. To address these challenges, we propose a data-driven method that jointly models rendering and inverse rendering as two conditional generation tasks within a single diffusion framework. Inspired by UniDiffuser, we utilize two distinct time schedules to model both tasks, and with a tailored dual-streaming module, we achieve cross-conditioning of two pre-trained diffusion models. This unified approach, named Uni-Renderer, allows the two processes to facilitate each other through a cycle-consistency constraint, mitigating ambiguity by enforcing consistency between intrinsic properties and rendered images. Combined with a meticulously prepared dataset, our method effectively decomposes intrinsic properties and demonstrates a strong capability to recognize changes during rendering. We will open-source our training and inference code to the public, fostering further research and development in this area.
+ ♻ ☆ The Hatching-Box: A Novel System for Automated Monitoring and Quantification of Drosophila melanogaster Developmental Behavior
+ In this paper we propose the Hatching-Box, a novel imaging and analysis +system to automatically monitor and quantify the developmental behavior of +Drosophila in standard rearing vials and during regular rearing routines, +rendering explicit experiments obsolete. This is achieved by combining custom +tailored imaging hardware with dedicated detection and tracking algorithms, +enabling the quantification of larvae, filled/empty pupae and flies over +multiple days. Given the affordable and reproducible design of the Hatching-Box +in combination with our generic client/server-based software, the system can +easily be scaled to monitor an arbitrary amount of rearing vials +simultaneously. We evaluated our system on a curated image dataset comprising +nearly 470,000 annotated objects and performed several studies on real world +experiments. We successfully reproduced results from well-established circadian +experiments by comparing the eclosion periods of wild type flies to the clock +mutants $\textit{per}^{short}$, $\textit{per}^{long}$ and $\textit{per}^0$ +without involvement of any manual labor. Furthermore we show, that the +Hatching-Box is able to extract additional information about group behavior as +well as to reconstruct the whole life-cycle of the individual specimens. These +results not only demonstrate the applicability of our system for long-term +experiments but also indicate its benefits for automated monitoring in the +general cultivation process. + +
+ comment: 17 pages, 6 figures

+ ♻ ☆ Steerable Conditional Diffusion for Out-of-Distribution Adaptation in Medical Image Reconstruction
+ Denoising diffusion models have emerged as the go-to generative framework for +solving inverse problems in imaging. A critical concern regarding these models +is their performance on out-of-distribution tasks, which remains an +under-explored challenge. Using a diffusion model on an out-of-distribution +dataset, realistic reconstructions can be generated, but with hallucinating +image features that are uniquely present in the training dataset. To address +this discrepancy during train-test time and improve reconstruction accuracy, we +introduce a novel sampling framework called Steerable Conditional Diffusion. +Specifically, this framework adapts the diffusion model, concurrently with +image reconstruction, based solely on the information provided by the available +measurement. Utilising our proposed method, we achieve substantial enhancements +in out-of-distribution performance across diverse imaging modalities, advancing +the robust deployment of denoising diffusion models in real-world applications. + +
+ ♻ ☆ LinPrim: Linear Primitives for Differentiable Volumetric Rendering
+ Volumetric rendering has become central to modern novel view synthesis +methods, which use differentiable rendering to optimize 3D scene +representations directly from observed views. While many recent works build on +NeRF or 3D Gaussians, we explore an alternative volumetric scene +representation. More specifically, we introduce two new scene representations +based on linear primitives-octahedra and tetrahedra-both of which define +homogeneous volumes bounded by triangular faces. This formulation aligns +naturally with standard mesh-based tools, minimizing overhead for downstream +applications. To optimize these primitives, we present a differentiable +rasterizer that runs efficiently on GPUs, allowing end-to-end gradient-based +optimization while maintaining realtime rendering capabilities. Through +experiments on real-world datasets, we demonstrate comparable performance to +state-of-the-art volumetric methods while requiring fewer primitives to achieve +similar reconstruction fidelity. Our findings provide insights into the +geometry of volumetric rendering and suggest that adopting explicit polyhedra +can expand the design space of scene representations. + +
+ comment: Project page: https://nicolasvonluetzow.github.io/LinPrim ; Project video: https://youtu.be/P2yeHwmGaeM

+ ♻ ☆ StableMaterials: Enhancing Diversity in Material Generation via Semi-Supervised Learning
+ We introduce StableMaterials, a novel approach for generating photorealistic +physical-based rendering (PBR) materials that integrate semi-supervised +learning with Latent Diffusion Models (LDMs). Our method employs adversarial +training to distill knowledge from existing large-scale image generation +models, minimizing the reliance on annotated data and enhancing the diversity +in generation. This distillation approach aligns the distribution of the +generated materials with that of image textures from an SDXL model, enabling +the generation of novel materials that are not present in the initial training +dataset. Furthermore, we employ a diffusion-based refiner model to improve the +visual quality of the samples and achieve high-resolution generation. Finally, +we distill a latent consistency model for fast generation in just four steps +and propose a new tileability technique that removes visual artifacts typically +associated with fewer diffusion steps. We detail the architecture and training +process of StableMaterials, the integration of semi-supervised training within +existing LDM frameworks and show the advantages of our approach. Comparative +evaluations with state-of-the-art methods show the effectiveness of +StableMaterials, highlighting its potential applications in computer graphics +and beyond. StableMaterials is publicly available at +https://gvecchio.com/stablematerials. + +
+ ♻ ☆ VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding
+ In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation +model for image and video understanding. The core design philosophy of +VideoLLaMA3 is vision-centric. The meaning of "vision-centric" is two-fold: the +vision-centric training paradigm and vision-centric framework design. The key +insight of our vision-centric training paradigm is that high-quality image-text +data is crucial for both image and video understanding. Instead of preparing +massive video-text datasets, we focus on constructing large-scale and +high-quality image-text datasets. VideoLLaMA3 has four training stages: 1) +Vision Encoder Adaptation, which enables vision encoder to accept images of +variable resolutions as input; 2) Vision-Language Alignment, which jointly +tunes the vision encoder, projector, and LLM with large-scale image-text data +covering multiple types (including scene images, documents, charts) as well as +text-only data. 3) Multi-task Fine-tuning, which incorporates image-text SFT +data for downstream tasks and video-text data to establish a foundation for +video understanding. 4) Video-centric Fine-tuning, which further improves the +model's capability in video understanding. As for the framework design, to +better capture fine-grained details in images, the pretrained vision encoder is +adapted to encode images of varying sizes into vision tokens with corresponding +numbers, rather than a fixed number of tokens. For video inputs, we reduce the +number of vision tokens according to their similarity so that the +representation of videos will be more precise and compact. Benefit from +vision-centric designs, VideoLLaMA3 achieves compelling performances in both +image and video understanding benchmarks. + +
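The video token reduction step, which merges highly similar neighbouring vision tokens, might look roughly like this (the threshold and the simple forward-merging rule are our assumptions, not the VideoLLaMA3 implementation):

import torch
import torch.nn.functional as F

def merge_similar_tokens(tokens: torch.Tensor, threshold: float = 0.95) -> torch.Tensor:
    """Average runs of consecutive tokens whose cosine similarity exceeds `threshold`.

    tokens: (n_tokens, dim) vision tokens ordered in time.
    """
    groups, current = [], [tokens[0]]
    for prev, cur in zip(tokens[:-1], tokens[1:]):
        if F.cosine_similarity(prev, cur, dim=0) > threshold:
            current.append(cur)                 # redundant token: fold into the current run
        else:
            groups.append(torch.stack(current).mean(dim=0))
            current = [cur]
    groups.append(torch.stack(current).mean(dim=0))
    return torch.stack(groups)

# compact = merge_similar_tokens(torch.randn(1024, 768))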
+ comment: BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3

+ ♻ ☆ GFE-Mamba: Mamba-based AD Multi-modal Progression Assessment via Generative Feature Extraction from MCI
+ Alzheimer's Disease (AD) is a progressive, irreversible neurodegenerative +disorder that often originates from Mild Cognitive Impairment (MCI). This +progression results in significant memory loss and severely affects patients' +quality of life. Clinical trials have consistently shown that early and +targeted interventions for individuals with MCI may slow or even prevent the +advancement of AD. Research indicates that accurate medical classification +requires diverse multimodal data, including detailed assessment scales and +neuroimaging techniques like Magnetic Resonance Imaging (MRI) and Positron +Emission Tomography (PET). However, simultaneously collecting the +aforementioned three modalities for training presents substantial challenges. +To tackle these difficulties, we propose GFE-Mamba, a multimodal classifier +founded on Generative Feature Extractor. The intermediate features provided by +this Extractor can compensate for the shortcomings of PET and achieve profound +multimodal fusion in the classifier. The Mamba block, as the backbone of the +classifier, enables it to efficiently extract information from long-sequence +scale information. Pixel-level Bi-cross Attention supplements pixel-level +information from MRI and PET. We provide our rationale for developing this +cross-temporal progression prediction dataset and the pre-trained Extractor +weights. Our experimental findings reveal that the GFE-Mamba model effectively +predicts the progression from MCI to AD and surpasses several leading methods +in the field. Our source code is available at +https://github.com/Tinysqua/GFE-Mamba. + +
+ comment: 13 pages, 9 figures

+ ♻ ☆ Uni-Sign: Toward Unified Sign Language Understanding at Scale ICLR 2025
+ Sign language pre-training has gained increasing attention for its ability to +enhance performance across various sign language understanding (SLU) tasks. +However, existing methods often suffer from a gap between pre-training and +fine-tuning, leading to suboptimal results. To address this, we propose +Uni-Sign, a unified pre-training framework that eliminates the gap between +pre-training and downstream SLU tasks through a large-scale generative +pre-training strategy and a novel fine-tuning paradigm. First, we introduce +CSL-News, a large-scale Chinese Sign Language (CSL) dataset containing 1,985 +hours of video paired with textual annotations, which enables effective +large-scale pre-training. Second, Uni-Sign unifies SLU tasks by treating +downstream tasks as a single sign language translation (SLT) task during +fine-tuning, ensuring seamless knowledge transfer between pre-training and +fine-tuning. Furthermore, we incorporate a prior-guided fusion (PGF) module and +a score-aware sampling strategy to efficiently fuse pose and RGB information, +addressing keypoint inaccuracies and improving computational efficiency. +Extensive experiments across multiple SLU benchmarks demonstrate that Uni-Sign +achieves state-of-the-art performance across multiple downstream SLU tasks. +Dataset and code are available at github.com/ZechengLi19/Uni-Sign. + +
+ comment: Accepted by ICLR 2025

+ ♻ ☆ Weakly-Supervised Learning via Multi-Lateral Decoder Branching for Tool Segmentation in Robot-Assisted Cardiovascular Catheterization
+ Robot-assisted catheterization has garnered considerable attention for its potential in treating cardiovascular diseases. However, advancing surgeon-robot collaboration still requires further research, particularly on task-specific automation. For instance, automated tool segmentation can assist surgeons in visualizing and tracking endovascular tools during cardiac procedures. While learning-based models have demonstrated state-of-the-art segmentation performance, generating ground-truth labels for fully supervised methods is labor-intensive, time-consuming, and costly. In this study, we propose a weakly-supervised learning method with multi-lateral pseudo labeling for tool segmentation in cardiovascular angiogram datasets. The method utilizes a modified U-Net architecture featuring one encoder and multiple laterally branched decoders. The decoders generate diverse pseudo labels under different perturbations, augmenting the available partial labels. The pseudo labels are self-generated using a mixed loss function with shared consistency across the decoders. The weakly-supervised model was trained end-to-end and validated using partially annotated angiogram data from three cardiovascular catheterization procedures. Validation results show that the model performs close to fully supervised models. The proposed weakly-supervised multi-lateral method also outperforms three well-known weakly-supervised learning methods, offering the highest segmentation performance across the three angiogram datasets. Furthermore, numerous ablation studies confirmed the model's consistent performance under different parameters. Finally, the model was applied to tool segmentation in robot-assisted catheterization experiments. The model enhanced visualization with high connectivity indices for the guidewire and catheter, and a mean processing time of 35 ms per frame.
+ ♻ ☆ Audio-Visual Deepfake Detection With Local Temporal Inconsistencies
+ This paper proposes an audio-visual deepfake detection approach that aims to +capture fine-grained temporal inconsistencies between audio and visual +modalities. To achieve this, both architectural and data synthesis strategies +are introduced. From an architectural perspective, a temporal distance map, +coupled with an attention mechanism, is designed to capture these +inconsistencies while minimizing the impact of irrelevant temporal +subsequences. Moreover, we explore novel pseudo-fake generation techniques to +synthesize local inconsistencies. Our approach is evaluated against +state-of-the-art methods using the DFDC and FakeAVCeleb datasets, demonstrating +its effectiveness in detecting audio-visual deepfakes. + +
+ comment: Accepted in ICASSP 2025

+ ♻ ☆ Slot-Guided Adaptation of Pre-trained Diffusion Models for Object-Centric Learning and Compositional Generation ICLR2025
+ We present SlotAdapt, an object-centric learning method that combines slot +attention with pretrained diffusion models by introducing adapters for +slot-based conditioning. Our method preserves the generative power of +pretrained diffusion models, while avoiding their text-centric conditioning +bias. We also incorporate an additional guidance loss into our architecture to +align cross-attention from adapter layers with slot attention. This enhances +the alignment of our model with the objects in the input image without using +external supervision. Experimental results show that our method outperforms +state-of-the-art techniques in object discovery and image generation tasks +across multiple datasets, including those with real images. Furthermore, we +demonstrate through experiments that our method performs remarkably well on +complex real-world images for compositional generation, in contrast to other +slot-based generative methods in the literature. The project page can be found +at https://kaanakan.github.io/SlotAdapt/. + +
+ comment: Accepted to ICLR2025. Project page: https://kaanakan.github.io/SlotAdapt/

+ ♻ ☆ Conterfactual Generative Zero-Shot Semantic Segmentation
+ Zero-shot learning is an essential part of computer vision. As a classical downstream task, zero-shot semantic segmentation has been studied because of its application value. One popular family of zero-shot semantic segmentation methods is based on a generative model, and most newly proposed works add structures to the same architecture to enhance it. However, we found that, from the viewpoint of causal inference, the result of the original model is influenced by spurious statistical relationships, so the predictions show severe bias. In this work, we consider counterfactual methods to avoid the confounder in the original model. Based on this, we propose a new framework for zero-shot semantic segmentation. Our model is compared with baseline models on two real-world datasets, Pascal-VOC and Pascal-Context. The experimental results show that the proposed models can surpass previous confounded models and can still make use of additional structures to improve performance. We also design a simple structure based on Graph Convolutional Networks (GCN) in this work.
+ comment: 11 pages, 8 figures

+ ♻ ☆ Proto-OOD: Enhancing OOD Object Detection with Prototype Feature Similarity
+ Neural networks that are trained on limited category samples often mispredict out-of-distribution (OOD) objects. We observe that features of the same category are more tightly clustered in feature space, while those of different categories are more dispersed. Based on this, we propose using prototype similarity for OOD detection. Drawing on the prototype features widely used in few-shot learning, we introduce a novel OOD detection network structure (Proto-OOD). Proto-OOD enhances the representativeness of category prototypes using a contrastive loss and detects OOD data by evaluating the similarity between input features and category prototypes. During training, Proto-OOD generates OOD samples for training the similarity module with a negative embedding generator. When Pascal VOC is used as the in-distribution dataset and MS-COCO as the OOD dataset, Proto-OOD significantly reduces the FPR (false positive rate). Moreover, considering the limitations of existing evaluation metrics, we propose a more reasonable evaluation protocol. The code will be released.
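Prototype-similarity scoring of the kind described above can be sketched as follows (cosine similarity and a fixed threshold are our simplifications, not the paper's learned similarity module):

import torch
import torch.nn.functional as F

def ood_score(features: torch.Tensor, prototypes: torch.Tensor) -> torch.Tensor:
    """Return an OOD score per feature: low maximum similarity to any prototype suggests OOD.

    features   : (N, D) detected-object embeddings
    prototypes : (C, D) one prototype per in-distribution category
    """
    sim = F.normalize(features, dim=-1) @ F.normalize(prototypes, dim=-1).T  # (N, C)
    return 1.0 - sim.max(dim=-1).values

# Flag detections whose score exceeds a validation-chosen threshold as OOD.
# is_ood = ood_score(feats, protos) > 0.6   # hypothetical tensors and threshold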
+ ♻ ☆ Efficiency Bottlenecks of Convolutional Kolmogorov-Arnold Networks: A Comprehensive Scrutiny with ImageNet, AlexNet, LeNet and Tabular Classification
+ Algorithmic developments such as Convolutional Neural Networks, transformers, attention mechanisms, and Retrieval Augmented Generation have changed Artificial Intelligence. A recent such development is the Kolmogorov-Arnold Network, which challenges the fundamental design of a neural network and thereby proposes an alternative to the Multilayer Perceptron and Convolutional Neural Networks. Kolmogorov-Arnold Networks have been well received for scientific modeling, yet they have drawbacks in terms of efficiency. In this paper, we train Convolutional Kolmogorov-Arnold Networks (CKANs) on the ImageNet-1k dataset with 1.3 million images, the MNIST dataset with 60k images, and a tabular biological science MoA dataset, and test the promise of CKANs in terms of FLOPS, inference time, number of trainable parameters, and training time against the accuracy, precision, recall, and F1 score they produce, compared with standard industry-practice CNN models. We show that CKANs perform comparably, though more slowly, than CNNs on small datasets like MoA and MNIST, but are not nearly competitive as the dataset becomes larger and more complex, as with ImageNet. The code implementation of this paper can be found at: https://github.com/ashimdahal/Study-of-Convolutional-Kolmogorov-Arnold-networks
+ ♻ ☆ Cauchy activation function and XNet
+ We have developed a novel activation function, named the Cauchy Activation Function. This function is derived from the Cauchy Integral Theorem in complex analysis and is specifically tailored for problems requiring high precision. This innovation has led to the creation of a new class of neural networks, which we call (Comple)XNet, or simply XNet. We will demonstrate that XNet is particularly effective for high-dimensional challenges such as image classification and solving Partial Differential Equations (PDEs). Our evaluations show that XNet significantly outperforms established models on benchmarks such as MNIST and CIFAR-10 in computer vision, and offers substantial advantages over Physics-Informed Neural Networks (PINNs) in both low-dimensional and high-dimensional PDE scenarios.
+ ♻ ☆ SPECIAL: Zero-shot Hyperspectral Image Classification With CLIP
+ Hyperspectral image (HSI) classification aims at categorizing each pixel in +an HSI into a specific land cover class, which is crucial for applications like +remote sensing, environmental monitoring, and agriculture. Although deep +learning-based HSI classification methods have achieved significant +advancements, existing methods still rely on manually labeled data for +training, which is both time-consuming and labor-intensive. To address this +limitation, we introduce a novel zero-shot hyperspectral image classification +framework based on CLIP (SPECIAL), aiming to eliminate the need for manual +annotations. The SPECIAL framework consists of two main stages: (1) CLIP-based +pseudo-label generation, and (2) noisy label learning. In the first stage, HSI +is spectrally interpolated to produce RGB bands. These bands are subsequently +classified using CLIP, resulting in noisy pseudo-labels that are accompanied by +confidence scores. To improve the quality of these labels, we propose a scaling +strategy that fuses predictions from multiple spatial scales. In the second +stage, spectral information and a label refinement technique are incorporated +to mitigate label noise and further enhance classification accuracy. +Experimental results on three benchmark datasets demonstrate that our SPECIAL +outperforms existing methods in zero-shot HSI classification, showing its +potential for more practical applications. The code is available at +https://github.com/LiPang/SPECIAL. + +
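Stage one, CLIP-based pseudo-labelling of an RGB composite, can be approximated with the Hugging Face CLIP interface as below (the prompts, model choice, and band handling are placeholders, not the paper's configuration):

import torch
from transformers import CLIPModel, CLIPProcessor

def clip_pseudo_labels(rgb_image, class_names):
    """Return (predicted class index, confidence) for one RGB composite of an HSI tile."""
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    prompts = [f"a satellite photo of {c}" for c in class_names]  # assumed prompt template
    inputs = processor(text=prompts, images=rgb_image, return_tensors="pt", padding=True)
    with torch.no_grad():
        probs = model(**inputs).logits_per_image.softmax(dim=-1)[0]
    conf, idx = probs.max(dim=-1)
    return int(idx), float(conf)

# label, confidence = clip_pseudo_labels(rgb_tile, ["water", "forest", "cropland"])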
+
+
+
+
+ + ♻ ☆ SpikSSD: Better Extraction and Fusion for Object Detection with Spiking + Neuron Networks + + +
+ As the third generation of neural networks, Spiking Neural Networks (SNNs) have gained widespread attention due to their low energy consumption and biological interpretability. Recently, SNNs have made considerable advancements in computer vision. However, efficiently conducting feature extraction and fusion under the spiking characteristics of SNNs for object detection remains a pressing challenge. To address this problem, we propose SpikSSD, a novel Spiking Single Shot Multibox Detector. Specifically, we design a full-spiking backbone network, MDS-ResNet, which effectively adjusts the membrane synaptic input distribution at each layer, achieving better spiking feature extraction. Additionally, for spiking feature fusion, we introduce the Spiking Bi-direction Fusion Module (SBFM), which for the first time realizes bi-directional fusion of spiking features, enhancing the multi-scale detection capability of the model. Experimental results show that SpikSSD achieves 40.8% mAP on the GEN1 dataset, and 76.3% and 52.4% mAP@0.5 on the VOC 2007 and COCO 2017 datasets respectively, with the lowest firing rate, outperforming existing SNN-based approaches at ultralow energy consumption. This work sets a new benchmark for future research in SNN-based object detection. Our code is publicly available at https://github.com/yimeng-fan/SpikSSD.
+
+
+
+
+ + ♻ ☆ GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent + Diffusion Transformer + + +
+ Speech-driven talking head generation is a critical yet challenging task with +applications in augmented reality and virtual human modeling. While recent +approaches using autoregressive and diffusion-based models have achieved +notable progress, they often suffer from modality inconsistencies, particularly +misalignment between audio and mesh, leading to reduced motion diversity and +lip-sync accuracy. To address this, we propose GLDiTalker, a novel +speech-driven 3D facial animation model based on a Graph Latent Diffusion +Transformer. GLDiTalker resolves modality misalignment by diffusing signals +within a quantized spatiotemporal latent space. It employs a two-stage training +pipeline: the Graph-Enhanced Quantized Space Learning Stage ensures lip-sync +accuracy, while the Space-Time Powered Latent Diffusion Stage enhances motion +diversity. Together, these stages enable GLDiTalker to generate realistic, +temporally stable 3D facial animations. Extensive evaluations on standard +benchmarks demonstrate that GLDiTalker outperforms existing methods, achieving +superior results in both lip-sync accuracy and motion diversity. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ EmoFace: Emotion-Content Disentangled Speech-Driven 3D Talking Face + Animation + + +
+ The creation of increasingly vivid 3D talking faces has become a hot topic in recent years. Currently, most speech-driven works focus on lip synchronisation but neglect to effectively capture the correlations between emotions and facial motions. To address this problem, we propose a two-stream network called EmoFace, which consists of an emotion branch and a content branch. EmoFace employs a novel Mesh Attention mechanism to analyse and fuse the emotion features and content features. In particular, a newly designed spatio-temporal graph-based convolution, SpiralConv3D, is used in Mesh Attention to learn potential temporal and spatial feature dependencies between mesh vertices. In addition, to the best of our knowledge, we are the first to introduce a self-growing training scheme with intermediate supervision to dynamically adjust the ratio of ground truth adopted in the 3D face animation task. Comprehensive quantitative and qualitative evaluations on our high-quality 3D emotional facial animation dataset, 3D-RAVDESS ($4.8863\times 10^{-5}$mm for LVE and $0.9509\times 10^{-5}$mm for EVE), together with the public dataset VOCASET ($2.8669\times 10^{-5}$mm for LVE and $0.4664\times 10^{-5}$mm for EVE), demonstrate that our approach achieves state-of-the-art performance.
+
+
+
+
+ + ♻ ☆ Multi-aspect Knowledge Distillation with Large Language Model + + +
+ Recent advancements in deep learning have significantly improved performance on computer vision tasks. Previous image classification methods primarily modify model architectures or add features, and they optimize models using cross-entropy loss on class logits. Since they focus on classifying images based solely on class labels, these methods may struggle to learn various \emph{aspects} of classes (e.g., natural positions and shape changes). Rethinking the previous approach from a novel view, we propose a multi-aspect knowledge distillation method using Multimodal Large Language Models (MLLMs). Our approach involves: 1) querying the MLLM with multi-aspect questions relevant to the knowledge we want to transfer to the model, 2) extracting the corresponding logits from the MLLM, and 3) expanding the model's output dimensions to distill these multi-aspect logits. We then apply cross-entropy loss to the class logits and binary cross-entropy loss to the multi-aspect logits. Through our method, the model can learn not only knowledge about visual aspects but also abstract and complex aspects that require a deeper understanding. We primarily apply our method to image classification, and to explore the potential for extending our model, we expand it to other tasks, such as object detection. In all experimental results, our method improves the performance of the baselines. Additionally, we analyze the effect of multi-aspect knowledge distillation. These results demonstrate that our method can transfer knowledge about various aspects to the model, and that this aspect knowledge can enhance model performance in computer vision tasks. This paper demonstrates the great potential of multi-aspect knowledge distillation, and we believe it offers a promising direction for future research in computer vision and beyond.
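The combined objective described above (cross-entropy on class logits plus binary cross-entropy on appended aspect logits) is easy to express directly. The sketch below is an illustrative PyTorch version; the split of the output head and the weighting factor alpha are assumptions, not the paper's exact configuration.

```python
import torch
import torch.nn.functional as F

def multi_aspect_kd_loss(logits, class_target, aspect_target, num_classes, alpha=1.0):
    """CE on the first `num_classes` logits, BCE on the remaining aspect logits.

    `logits` has shape (B, num_classes + num_aspects); alpha is an illustrative weight.
    """
    class_logits = logits[:, :num_classes]
    aspect_logits = logits[:, num_classes:]
    ce = F.cross_entropy(class_logits, class_target)
    bce = F.binary_cross_entropy_with_logits(aspect_logits, aspect_target)
    return ce + alpha * bce

# Toy example: 10 classes, 5 aspect logits distilled from an MLLM.
logits = torch.randn(4, 15)
loss = multi_aspect_kd_loss(logits, torch.randint(0, 10, (4,)),
                            torch.rand(4, 5), num_classes=10)
print(loss.item())
```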
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Manydepth2: Motion-Aware Self-Supervised Multi-Frame Monocular Depth + Estimation in Dynamic Scenes + + +
+ Despite advancements in self-supervised monocular depth estimation, challenges persist in dynamic scenarios due to the dependence on assumptions about a static world. In this paper, we present Manydepth2, which achieves precise depth estimation for both dynamic objects and static backgrounds while maintaining computational efficiency. To tackle the challenges posed by dynamic content, we incorporate optical flow and coarse monocular depth to create a pseudo-static reference frame. This frame is then utilized to build a motion-aware cost volume in collaboration with the vanilla target frame. Furthermore, to improve the accuracy and robustness of the network architecture, we propose an attention-based depth network that effectively integrates information from feature maps at different resolutions by incorporating both channel and non-local attention mechanisms. Compared to methods with similar computational costs, Manydepth2 achieves a significant reduction of approximately five percent in root-mean-square error for self-supervised monocular depth estimation on the KITTI-2015 dataset. The code can be found at https://github.com/kaichen-z/Manydepth2.
+
+ comment: Monocular Depth Estimation, Self-Supervised, Optical Flow +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 26 + +
+
+
+ + ☆ An FPGA-Based Neuro-Fuzzy Sensor for Personalized Driving Assistance + + +
+ Advanced driving-assistance systems (ADAS) are intended to automate driving tasks and improve driving and vehicle safety. This work proposes an intelligent neuro-fuzzy sensor for driving style (DS) recognition, suitable for ADAS enhancement. The development of the driving style intelligent sensor uses naturalistic driving data from the SHRP2 study, which includes data from a CAN bus, an inertial measurement unit, and a front radar. The system has been successfully implemented using a field-programmable gate array (FPGA) device of the Xilinx Zynq programmable system-on-chip (PSoC). It can mimic the typical timing parameters of a group of drivers as well as tune these typical parameters to model individual DSs. The neuro-fuzzy intelligent sensor provides high-speed real-time active ADAS implementation and is able to personalize its behavior within safe margins without driver intervention. In particular, the personalization procedure of the time headway (THW) parameter for an ACC in steady car following was developed, achieving a processing time of 0.53 microseconds. This performance fulfilled the requirements of cutting-edge active ADAS specifications.
+
+ comment: Journal Article +
+
+
+
+
+ + ☆ 3D Reconstruction of non-visible surfaces of objects from a Single Depth + View -- Comparative Study + + +
+ Scene and object reconstruction is an important problem in robotics, in particular for planning collision-free trajectories or for object manipulation. This paper compares two strategies for the reconstruction of non-visible parts of the object surface from a single RGB-D camera view. The first method, DeepSDF, predicts the Signed Distance Transform to the object surface for a given point in 3D space. The second method, MirrorNet, reconstructs the occluded parts of objects by generating images from the other side of the observed object. Experiments performed with objects from the ShapeNet dataset show that the view-dependent MirrorNet is faster and has smaller reconstruction errors in most categories.
+
+
+
+
+ + ☆ Underactuated dexterous robotic grasping with reconfigurable passive + joints + + +
+ We introduce a novel reconfigurable passive joint (RP-joint), which has been +implemented and tested on an underactuated three-finger robotic gripper. +RP-joint has no actuation, but instead it is lightweight and compact. It can be +easily reconfigured by applying external forces and locked to perform complex +dexterous manipulation tasks, but only after tension is applied to the +connected tendon. Additionally, we present an approach that allows learning +dexterous grasps from single examples with underactuated grippers and +automatically configures the RP-joints for dexterous manipulation. This is +enhanced by integrating kinaesthetic contact optimization, which improves grasp +performance even further. The proposed RP-joint gripper and grasp planner have +been tested on over 370 grasps executed on 42 IKEA objects and on the YCB +object dataset, achieving grasping success rates of 80% and 87%, on IKEA and +YCB, respectively. + +
+
+
+
+
+ + ☆ Robust Mobile Robot Path Planning via LLM-Based Dynamic Waypoint + Generation + + +
+ Mobile robot path planning in complex environments remains a significant challenge, especially in achieving efficient, safe and robust paths. Traditional path planning techniques such as DRL models are typically trained for a given configuration of starting point and target positions, and therefore only perform well when those conditions are satisfied. In this paper, we propose a novel path planning framework that embeds Large Language Models to empower mobile robots with the capability of dynamically interpreting natural language commands and autonomously generating efficient, collision-free navigation paths. The proposed framework uses LLMs to translate high-level user inputs into actionable waypoints while dynamically adjusting paths in response to obstacles. We experimentally evaluated our proposed LLM-based approach across three environments of progressive complexity, showing the robustness of our approach with the llama3.1 model, which outperformed other LLM models in path planning time, waypoint generation success rate, and collision avoidance. This underlines the promising contribution of LLMs for enhancing the capability of mobile robots, especially when their operation involves complex decisions in large and complex environments. Our framework provides safer, more reliable navigation and opens a new direction for future research. The source code of this work is publicly available on GitHub.
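The "high-level user input to actionable waypoints" step can be sketched as an LLM call whose JSON reply is parsed into coordinates. This is not the paper's prompt or schema: `call_llm` is a hypothetical wrapper around whatever chat API is used, and the prompt format and waypoint schema are illustrative assumptions.

```python
import json
from typing import Callable, List, Tuple

def plan_waypoints(instruction: str, obstacles: List[Tuple[float, float]],
                   call_llm: Callable[[str], str]) -> List[Tuple[float, float]]:
    """Ask an LLM for 2D waypoints and parse its JSON reply (illustrative schema)."""
    prompt = (
        "You are a mobile-robot planner. Obstacles (x, y): "
        f"{obstacles}. Instruction: {instruction}. "
        'Reply only with JSON: {"waypoints": [[x, y], ...]}'
    )
    reply = call_llm(prompt)  # call_llm is a hypothetical API wrapper
    data = json.loads(reply)
    return [tuple(p) for p in data["waypoints"]]

# Offline usage with a canned reply standing in for the model:
fake_llm = lambda _prompt: '{"waypoints": [[0.0, 0.0], [1.0, 2.0], [3.0, 2.5]]}'
print(plan_waypoints("go to the charging dock", [(2.0, 1.0)], fake_llm))
```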
+
+ comment: 18 pages, 6 figures, submitted in Journal Expert Systems with + Applications +
+
+
+
+
+ + ☆ LLM-attacker: Enhancing Closed-loop Adversarial Scenario Generation for + Autonomous Driving with Large Language Models + + +
+ Ensuring and improving the safety of autonomous driving systems (ADS) is crucial for the deployment of highly automated vehicles, especially in safety-critical events. Because such events are rare, adversarial scenario generation methods have been developed, in which the behaviors of traffic participants are manipulated to induce safety-critical events. However, existing methods still face two limitations. First, the identification of the adversarial participant directly impacts the effectiveness of the generation, yet the complexity of real-world scenarios, with numerous participants and diverse behaviors, makes identification challenging. Second, the potential of generated safety-critical scenarios to continuously improve ADS performance remains underexplored. To address these issues, we propose LLM-attacker: a closed-loop adversarial scenario generation framework leveraging large language models (LLMs). Specifically, multiple LLM agents are designed and coordinated to identify optimal attackers. Then, the trajectories of the attackers are optimized to generate adversarial scenarios. These scenarios are iteratively refined based on the performance of the ADS, forming a feedback loop to improve the ADS. Experimental results show that LLM-attacker can create more dangerous scenarios than other methods, and that an ADS trained with them achieves a collision rate half that of one trained with normal scenarios. This indicates the ability of LLM-attacker to test and enhance the safety and robustness of ADS. Video demonstrations are provided at: https://drive.google.com/file/d/1Zv4V3iG7825oyiKbUwS2Y-rR0DQIE1ZA/view.
+
+
+
+
+ + ☆ SpatialVLA: Exploring Spatial Representations for Visual-Language-Action + Model + + +
+ In this paper, we claim that spatial understanding is the key to robot manipulation, and propose SpatialVLA to explore effective spatial representations for the robot foundation model. Specifically, we introduce Ego3D Position Encoding to inject 3D information into the input observations of the visual-language-action model, and propose Adaptive Action Grids to represent spatial robot movement actions with adaptive discretized action grids, facilitating the learning of generalizable and transferable spatial action knowledge for cross-robot control. SpatialVLA is first pre-trained on top of a vision-language model with 1.1 million real-world robot episodes, to learn a generalist manipulation policy across multiple robot environments and tasks. After pre-training, SpatialVLA is directly applied to perform numerous tasks in a zero-shot manner. The superior results in both simulation and on real-world robots demonstrate its advantage in inferring complex robot motion trajectories and its strong in-domain multi-task generalization ability. We further show that the proposed Adaptive Action Grids offer a new and effective way to fine-tune the pre-trained SpatialVLA model for new simulation and real-world setups, where the pre-learned action grids are re-discretized to capture the robot-specific spatial action movements of new setups. The superior results from extensive evaluations demonstrate exceptional in-distribution generalization and out-of-distribution adaptation capability, highlighting the crucial benefit of the proposed spatial-aware representations for generalist robot policy learning. All the details and codes will be open-sourced.
+
+
+
+
+ + ☆ Autonomous Horizon-based Asteroid Navigation With + Observability-constrained Maneuvers + + +
+ Asteroid exploration is a pertinent challenge due to the varying complexity of asteroids' dynamical environments and shapes, and the communication delays caused by distance. Thus, autonomous navigation methods are continually being developed and improved in current research to enable safe exploration. These methods often involve using horizon-based Optical Navigation (OpNav) to determine the spacecraft's location, which is reliant on the visibility of the horizon. It is critical to ensure the reliability of this measurement so that the spacecraft may maintain an accurate state estimate throughout its mission. This paper presents an algorithm that generates control maneuvers for spacecraft to follow trajectories that allow continuously usable optical measurements to maintain system observability for safe navigation. This algorithm improves upon existing asteroid navigation capabilities by allowing the safe and robust autonomous targeting of various trajectories and orbits at a wide range of distances within optical measurement range, and it is adaptable to different asteroid scenarios. Overall, the approach develops an all-encompassing system that simulates the asteroid dynamics, synthetic image generation, edge detection, horizon-based OpNav, filtering, and observability-enhancing control.
+
+ comment: 38 pages, 16 figures, preprint under journal review +
+
+
+
+
+ + ☆ Error-State LQR Formulation for Quadrotor UAV Trajectory Tracking + + +
+ This article presents an error-state Linear Quadratic Regulator (LQR) +formulation for robust trajectory tracking in quadrotor Unmanned Aerial +Vehicles (UAVs). The proposed approach leverages error-state dynamics and +employs exponential coordinates to represent orientation errors, enabling a +linearized system representation for real-time control. The control strategy +integrates an LQR-based full-state feedback controller for trajectory tracking, +combined with a cascaded bodyrate controller to handle actuator dynamics. +Detailed derivations of the error-state dynamics, the linearization process, +and the controller design are provided, highlighting the applicability of the +method for precise and stable quadrotor control in dynamic environments. + +
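The core of any LQR-based tracking controller is computing the feedback gain from the linearized dynamics. The sketch below is a generic continuous-time LQR computed with SciPy on a toy double integrator; the quadrotor's actual error-state matrices, exponential-coordinate attitude errors, and body-rate cascade from the paper are not reproduced here.

```python
import numpy as np
from scipy.linalg import solve_continuous_are

def lqr_gain(A, B, Q, R):
    """Continuous-time LQR: solve the algebraic Riccati equation and return K = R^{-1} B^T P."""
    P = solve_continuous_are(A, B, Q, R)
    return np.linalg.solve(R, B.T @ P)

# Toy double integrator standing in for the linearized error-state dynamics.
A = np.array([[0.0, 1.0], [0.0, 0.0]])
B = np.array([[0.0], [1.0]])
K = lqr_gain(A, B, Q=np.diag([10.0, 1.0]), R=np.array([[0.1]]))

x_err = np.array([0.5, -0.2])   # error state (e.g., position and velocity error)
u = -K @ x_err                  # full-state feedback that drives the error toward zero
print(K, u)
```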
+
+
+
+
+ + ☆ Generalized Mission Planning for Heterogeneous Multi-Robot Teams via + LLM-constructed Hierarchical Trees + + +
+ We present a novel mission-planning strategy for heterogeneous multi-robot +teams, taking into account the specific constraints and capabilities of each +robot. Our approach employs hierarchical trees to systematically break down +complex missions into manageable sub-tasks. We develop specialized APIs and +tools, which are utilized by Large Language Models (LLMs) to efficiently +construct these hierarchical trees. Once the hierarchical tree is generated, it +is further decomposed to create optimized schedules for each robot, ensuring +adherence to their individual constraints and capabilities. We demonstrate the +effectiveness of our framework through detailed examples covering a wide range +of missions, showcasing its flexibility and scalability. + +
+
+
+
+
+ + ☆ Enhanced Position Estimation in Tactile Internet-Enabled Remote Robotic + Surgery Using MOESP-Based Kalman Filter + + +
+ Accurately estimating the position of a patient's side robotic arm in real +time during remote surgery is a significant challenge, especially within +Tactile Internet (TI) environments. This paper presents a new and efficient +method for position estimation using a Kalman Filter (KF) combined with the +Multivariable Output-Error State Space (MOESP) method for system +identification. Unlike traditional approaches that require prior knowledge of +the system's dynamics, this study uses the JIGSAW dataset, a comprehensive +collection of robotic surgical data, along with input from the Master Tool +Manipulator (MTM) to derive the state-space model directly. The MOESP method +allows accurate modeling of the Patient Side Manipulator (PSM) dynamics without +prior system models, improving the KF's performance under simulated network +conditions, including delays, jitter, and packet loss. These conditions mimic +real-world challenges in Tactile Internet applications. The findings +demonstrate the KF's improved resilience and accuracy in state estimation, +achieving over 95 percent accuracy despite network-induced uncertainties. + +
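The estimation step itself is a standard Kalman predict/update cycle applied to an identified linear model. The sketch below assumes the discrete state-space matrices (A, B, C) are already available from a system-identification step such as MOESP, which is omitted; the toy model and noise covariances are placeholders.

```python
import numpy as np

def kalman_step(x, P, u, y, A, B, C, Q, R):
    """One predict/update cycle for x' = A x + B u, y = C x with noise covariances Q, R."""
    # Predict
    x_pred = A @ x + B @ u
    P_pred = A @ P @ A.T + Q
    # Update
    S = C @ P_pred @ C.T + R
    K = P_pred @ C.T @ np.linalg.inv(S)
    x_new = x_pred + K @ (y - C @ x_pred)
    P_new = (np.eye(len(x)) - K @ C) @ P_pred
    return x_new, P_new

# Toy 2-state, 1-input, 1-output model standing in for an identified PSM model.
A = np.array([[1.0, 0.1], [0.0, 1.0]]); B = np.array([[0.0], [0.1]]); C = np.array([[1.0, 0.0]])
x, P = np.zeros(2), np.eye(2)
x, P = kalman_step(x, P, u=np.array([1.0]), y=np.array([0.05]),
                   A=A, B=B, C=C, Q=1e-4 * np.eye(2), R=np.array([[1e-2]]))
print(x)
```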
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.04503 +
+
+
+
+
+ + ☆ Modular Framework for Uncertainty Prediction in Autonomous Vehicle + Motion Forecasting within Complex Traffic Scenarios + + +
+ We propose a modular modeling framework designed to enhance the capture and +validation of uncertainty in autonomous vehicle (AV) trajectory prediction. +Departing from traditional deterministic methods, our approach employs a +flexible, end-to-end differentiable probabilistic encoder-decoder architecture. +This modular design allows the encoder and decoder to be trained independently, +enabling seamless adaptation to diverse traffic scenarios without retraining +the entire system. Our key contributions include: (1) a probabilistic heatmap +predictor that generates context-aware occupancy grids for dynamic forecasting, +(2) a modular training approach that supports independent component training +and flexible adaptation, and (3) a structured validation scheme leveraging +uncertainty metrics to evaluate robustness under high-risk conditions. To +highlight the benefits of our framework, we benchmark it against an end-to-end +baseline, demonstrating faster convergence, improved stability, and +flexibility. Experimental results validate these advantages, showcasing the +capacity of the framework to efficiently handle complex scenarios while +ensuring reliable predictions and robust uncertainty representation. This +modular design offers significant practical utility and scalability for +real-world autonomous driving applications. + +
+
+
+
+
+ + ☆ BiFold: Bimanual Cloth Folding with Language Guidance ICRA 2025 + + +
+ Cloth folding is a complex task due to the inevitable self-occlusions of +clothes, their complicated dynamics, and the disparate materials, geometries, +and textures that garments can have. In this work, we learn folding actions +conditioned on text commands. Translating high-level, abstract instructions +into precise robotic actions requires sophisticated language understanding and +manipulation capabilities. To do that, we leverage a pre-trained +vision-language model and repurpose it to predict manipulation actions. Our +model, BiFold, can take context into account and achieves state-of-the-art +performance on an existing language-conditioned folding benchmark. Given the +lack of annotated bimanual folding data, we devise a procedure to automatically +parse actions of a simulated dataset and tag them with aligned text +instructions. BiFold attains the best performance on our dataset and can +transfer to new instructions, garments, and environments. + +
+
+ comment: Accepted at ICRA 2025 +
+
+
+
+
+ + ☆ PhysBench: Benchmarking and Enhancing Vision-Language Models for + Physical World Understanding ICLR 2025 + + +
+ Understanding the physical world is a fundamental challenge in embodied AI, +critical for enabling agents to perform complex tasks and operate safely in +real-world environments. While Vision-Language Models (VLMs) have shown great +promise in reasoning and task planning for embodied agents, their ability to +comprehend physical phenomena remains extremely limited. To close this gap, we +introduce PhysBench, a comprehensive benchmark designed to evaluate VLMs' +physical world understanding capability across a diverse set of tasks. +PhysBench contains 100,000 entries of interleaved video-image-text data, +categorized into four major domains: physical object properties, physical +object relationships, physical scene understanding, and physics-based dynamics, +further divided into 19 subclasses and 8 distinct capability dimensions. Our +extensive experiments, conducted on 75 representative VLMs, reveal that while +these models excel in common-sense reasoning, they struggle with understanding +the physical world -- likely due to the absence of physical knowledge in their +training data and the lack of embedded physical priors. To tackle the +shortfall, we introduce PhysAgent, a novel framework that combines the +generalization strengths of VLMs with the specialized expertise of vision +models, significantly enhancing VLMs' physical understanding across a variety +of tasks, including an 18.4\% improvement on GPT-4o. Furthermore, our results +demonstrate that enhancing VLMs' physical world understanding capabilities can +help embodied agents such as MOKA. We believe that PhysBench and PhysAgent +offer valuable insights and contribute to bridging the gap between VLMs and +physical world understanding. + +
+
+ comment: ICLR 2025. Project page: https://physbench.github.io/; Dataset: + https://huggingface.co/datasets/USC-GVL/PhysBench; +
+
+
+
+
+ + ♻ ☆ λ: A Benchmark for Data-Efficiency in Long-Horizon Indoor Mobile + Manipulation Robotics + + +
+ Efficiently learning and executing long-horizon mobile manipulation (MoMa) +tasks is crucial for advancing robotics in household and workplace settings. +However, current MoMa models are data-inefficient, underscoring the need for +improved models that require realistic-sized benchmarks to evaluate their +efficiency, which do not exist. To address this, we introduce the LAMBDA +({\lambda}) benchmark (Long-horizon Actions for Mobile-manipulation +Benchmarking of Directed Activities), which evaluates the data efficiency of +models on language-conditioned, long-horizon, multi-room, multi-floor, +pick-and-place tasks using a dataset of manageable size, more feasible for +collection. The benchmark includes 571 human-collected demonstrations that +provide realism and diversity in simulated and real-world settings. Unlike +planner-generated data, these trajectories offer natural variability and +replay-verifiability, ensuring robust learning and evaluation. We benchmark +several models, including learning-based models and a neuro-symbolic modular +approach combining foundation models with task and motion planning. +Learning-based models show suboptimal success rates, even when leveraging +pretrained weights, underscoring significant data inefficiencies. However, the +neuro-symbolic approach performs significantly better while being more data +efficient. Findings highlight the need for more data-efficient learning-based +MoMa approaches. {\lambda} addresses this gap by serving as a key benchmark for +evaluating the data efficiency of those future models in handling household +robotics tasks. + +
+
+
+
+
+ + ♻ ☆ TelePreview: A User-Friendly Teleoperation System with Virtual Arm + Assistance for Enhanced Effectiveness + + +
+ Teleoperation provides an effective way to collect robot data, which is crucial for learning from demonstrations. In this field, teleoperation faces several key challenges: user-friendliness for new users, safety assurance, and transferability across different platforms. While collecting real robot dexterous manipulation data by teleoperation to train robots has shown impressive results on diverse tasks, the morphological differences between human and robot hands not only make it hard for new users to understand the action mapping but also raise potential safety concerns during operation. To address these limitations, we introduce TelePreview. This teleoperation system offers real-time visual feedback on robot actions based on human user inputs, with a total hardware cost of less than $1,000. TelePreview allows the user to see a virtual robot that represents the outcome of the user's next movement. By enabling flexible switching between command visualization and actual execution, this system helps new users learn how to demonstrate quickly and safely. We demonstrate that it outperforms other teleoperation systems across five tasks, emphasize its ease of use, and highlight its straightforward deployment across diverse robotic platforms. We release our code and a deployment document on our website https://nus-lins-lab.github.io/telepreview/.
+
+ comment: In submission +
+
+
+
+
+ + ♻ ☆ Visual-Lidar Map Alignment for Infrastructure Inspections + + +
+ Routine and repetitive infrastructure inspections present safety, efficiency, +and consistency challenges as they are performed manually, often in challenging +or hazardous environments. They can also introduce subjectivity and errors into +the process, resulting in undesirable outcomes. Simultaneous localization and +mapping (SLAM) presents an opportunity to generate high-quality 3D maps that +can be used to extract accurate and objective inspection data. Yet, many SLAM +algorithms are limited in their ability to align 3D maps from repeated +inspections in GPS-denied settings automatically. This limitation hinders +practical long-term asset health assessments by requiring tedious manual +alignment for data association across scans from previous inspections. This +paper introduces a versatile map alignment algorithm leveraging both visual and +lidar data for improved place recognition robustness and presents an +infrastructure-focused dataset tailored for consecutive inspections. By +detaching map alignment from SLAM, our approach enhances infrastructure +inspection pipelines, supports monitoring asset degradation over time, and +invigorates SLAM research by permitting exploration beyond existing +multi-session SLAM algorithms. + +
+
+ comment: 8 pages, 8 figures, for associated code see + https://github.com/jakemclaughlin6/vlma +
+
+
+
+
+ + ♻ ☆ Segmentation Dataset for Reinforced Concrete Construction + + +
+ This paper provides a dataset of 14,805 RGB images with segmentation labels +for autonomous robotic inspection of reinforced concrete defects. Baselines for +the YOLOv8L-seg, DeepLabV3, and U-Net segmentation models are established. +Labelling inconsistencies are addressed statistically, and their influence on +model performance is analyzed. An error identification tool is employed to +examine the error modes of the models. The paper demonstrates that YOLOv8L-seg +performs best, achieving a validation mIOU score of up to 0.59. Label +inconsistencies were found to have a negligible effect on model performance, +while the inclusion of more data improved the performance. False negatives were +identified as the primary failure mode. The results highlight the importance of +data availability for the performance of deep learning-based models. The lack +of publicly available data is identified as a significant contributor to false +negatives. To address this, the paper advocates for an increased open-source +approach within the construction community. + +
+
+ comment: The ConRebSeg Dataset can be found under the following DOI: + https://doi.org/10.11583/DTU.26213762 Corresponding code to download + additional data and initialize the dataset under + https://github.com/DTU-PAS/ConRebSeg This work is an accepted manuscript up + for publication in the Elsevier journal "Automation in Construction" +
+
+
+
+
+ + ♻ ☆ SNN-Based Online Learning of Concepts and Action Laws in an Open World + + +
+ We present the architecture of a fully autonomous, bio-inspired cognitive agent built around a spiking neural network (SNN) implementing the agent's semantic memory. The agent explores its universe and learns concepts of objects/situations and of its own actions in a one-shot manner. While object/situation concepts are unary, action concepts are triples made up of an initial situation, a motor activity, and an outcome. They embody the agent's knowledge of its universe's action laws. Both kinds of concepts have different degrees of generality. To make decisions, the agent queries its semantic memory for the expected outcomes of envisaged actions and chooses the action to take on the basis of these predictions. Our experiments show that the agent handles new situations by appealing to previously learned general concepts and rapidly modifies its concepts to adapt to environment changes.
+
+
+
+
+ + ♻ ☆ Collective Intelligence for 2D Push Manipulations with Mobile Robots RA-L + + +
+ While natural systems often present collective intelligence that allows them +to self-organize and adapt to changes, the equivalent is missing in most +artificial systems. We explore the possibility of such a system in the context +of cooperative 2D push manipulations using mobile robots. Although conventional +works demonstrate potential solutions for the problem in restricted settings, +they have computational and learning difficulties. More importantly, these +systems do not possess the ability to adapt when facing environmental changes. +In this work, we show that by distilling a planner derived from a +differentiable soft-body physics simulator into an attention-based neural +network, our multi-robot push manipulation system achieves better performance +than baselines. In addition, our system also generalizes to configurations not +seen during training and is able to adapt toward task completions when external +turbulence and environmental changes are applied. Supplementary videos can be +found on our project website: https://sites.google.com/view/ciom/home + +
+
+ comment: Published in IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ GenORM: Generalizable One-shot Rope Manipulation with Parameter-Aware + Policy ICRA 2024 + + +
+ Due to the inherent uncertainty in their deformability during motion, previous methods in rope manipulation often require hundreds of real-world demonstrations to train a manipulation policy for each rope, even for simple tasks such as rope goal reaching, which hinders their application in our ever-changing world. To address this issue, we introduce GenORM, a framework that allows the manipulation policy to handle different deformable ropes with a single real-world demonstration. To achieve this, we augment the policy by conditioning it on deformable rope parameters and training it with a diverse range of simulated deformable ropes so that the policy can adjust its actions based on different rope parameters. At inference time, given a new rope, GenORM estimates the deformable rope parameters by minimizing the disparity between the grid density of point clouds of real-world demonstrations and simulations. With the help of a differentiable physics simulator, we require only a single real-world demonstration. Empirical validations on both simulated and real-world rope manipulation setups clearly show that our method can manipulate different ropes with a single demonstration and significantly outperforms the baseline in both environments (a 62% improvement on in-domain ropes and a 15% improvement on out-of-distribution ropes in simulation, and a 26% improvement in the real world), demonstrating the effectiveness of our approach in one-shot rope manipulation.
+
+ comment: The extended version of this paper, GenDOM, was published in the 2024 + IEEE International Conference on Robotics and Automation (ICRA 2024), + arXiv:2309.09051 +
+
+
+
+
+ + ♻ ☆ GenDOM: Generalizable One-shot Deformable Object Manipulation with + Parameter-Aware Policy ICRA 2024 + + +
+ Due to the inherent uncertainty in their deformability during motion, +previous methods in deformable object manipulation, such as rope and cloth, +often required hundreds of real-world demonstrations to train a manipulation +policy for each object, which hinders their applications in our ever-changing +world. To address this issue, we introduce GenDOM, a framework that allows the +manipulation policy to handle different deformable objects with only a single +real-world demonstration. To achieve this, we augment the policy by +conditioning it on deformable object parameters and training it with a diverse +range of simulated deformable objects so that the policy can adjust actions +based on different object parameters. At the time of inference, given a new +object, GenDOM can estimate the deformable object parameters with only a single +real-world demonstration by minimizing the disparity between the grid density +of point clouds of real-world demonstrations and simulations in a +differentiable physics simulator. Empirical validations on both simulated and +real-world object manipulation setups clearly show that our method can +manipulate different objects with a single demonstration and significantly +outperforms the baseline in both environments (a 62% improvement for in-domain +ropes and a 15% improvement for out-of-distribution ropes in simulation, as +well as a 26% improvement for ropes and a 50% improvement for cloths in the +real world), demonstrating the effectiveness of our approach in one-shot +deformable object manipulation. + +
+
+ comment: Published in the 2024 IEEE International Conference on Robotics and + Automation (ICRA 2024). arXiv admin note: substantial text overlap with + arXiv:2306.09872 +
+
+
+
+
+ + ♻ ☆ Multi-Agent Behavior Retrieval: Retrieval-Augmented Policy Training for + Cooperative Push Manipulation by Mobile Robots IROS 2024 + + +
+ Due to the complex interactions between agents, learning a multi-agent control policy often requires a prohibitive amount of data. This paper aims to enable multi-agent systems to effectively utilize past memories to adapt to novel collaborative tasks in a data-efficient fashion. We propose the Multi-Agent Coordination Skill Database, a repository for storing a collection of coordinated behaviors associated with key vectors distinctive to them. Our Transformer-based skill encoder effectively captures spatio-temporal interactions that contribute to coordination and provides a unique skill representation for each coordinated behavior. By leveraging only a small number of demonstrations of the target task, the database enables us to train the policy using a dataset augmented with the retrieved demonstrations. Experimental evaluations demonstrate that our method achieves a significantly higher success rate in push manipulation tasks compared with baseline methods such as few-shot imitation learning. Furthermore, we validate the effectiveness of our retrieve-and-learn framework in a real environment using a team of wheeled robots.
+
+ comment: Published in the 2024 IEEE/RSJ International Conference on + Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ QuIP: Experimental design for expensive simulators with many Qualitative + factors via Integer Programming + + +
+ The need to explore and/or optimize expensive simulators with many +qualitative factors arises in broad scientific and engineering problems. Our +motivating application lies in path planning - the exploration of feasible +paths for navigation, which plays an important role in robotics, surgical +planning and assembly planning. Here, the feasibility of a path is evaluated +via expensive virtual experiments, and its parameter space is typically +discrete and high-dimensional. A carefully selected experimental design is thus +essential for timely decision-making. We propose here a novel framework, called +QuIP, for experimental design of Qualitative factors via Integer Programming +under a Gaussian process surrogate model with an exchangeable covariance +function. For initial design, we show that its asymptotic D-optimal design can +be formulated as a variant of the well-known assignment problem in operations +research, which can be efficiently solved to global optimality using +state-of-the-art integer programming solvers. For sequential design +(specifically, for active learning or black-box optimization), we show that its +design criterion can similarly be formulated as an assignment problem, thus +enabling efficient and reliable optimization with existing solvers. We then +demonstrate the effectiveness of QuIP over existing methods in a suite of path +planning experiments and an application to rover trajectory optimization. + +
+
+
+
+
+ + ♻ ☆ GCBF+: A Neural Graph Control Barrier Function Framework for Distributed + Safe Multi-Agent Control + + +
+ Distributed, scalable, and safe control of large-scale multi-agent systems is +a challenging problem. In this paper, we design a distributed framework for +safe multi-agent control in large-scale environments with obstacles, where a +large number of agents are required to maintain safety using only local +information and reach their goal locations. We introduce a new class of +certificates, termed graph control barrier function (GCBF), which are based on +the well-established control barrier function theory for safety guarantees and +utilize a graph structure for scalable and generalizable distributed control of +MAS. We develop a novel theoretical framework to prove the safety of an +arbitrary-sized MAS with a single GCBF. We propose a new training framework +GCBF+ that uses graph neural networks to parameterize a candidate GCBF and a +distributed control policy. The proposed framework is distributed and is +capable of taking point clouds from LiDAR, instead of actual state information, +for real-world robotic applications. We illustrate the efficacy of the proposed +method through various hardware experiments on a swarm of drones with +objectives ranging from exchanging positions to docking on a moving target +without collision. Additionally, we perform extensive numerical experiments, +where the number and density of agents, as well as the number of obstacles, +increase. Empirical results show that in complex environments with agents with +nonlinear dynamics (e.g., Crazyflie drones), GCBF+ outperforms the hand-crafted +CBF-based method with the best performance by up to 20% for relatively +small-scale MAS with up to 256 agents, and leading reinforcement learning (RL) +methods by up to 40% for MAS with 1024 agents. Furthermore, the proposed method +does not compromise on the performance, in terms of goal reaching, for +achieving high safety rates, which is a common trade-off in RL-based methods. + +
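The basic control-barrier-function idea underlying GCBF can be illustrated with a scalar safety filter: given control-affine dynamics and a barrier h(x), the nominal control is minimally adjusted so that dh/dt + alpha*h(x) >= 0. The sketch below is the generic single-constraint closed-form solution, not the learned graph CBF or the distributed policy of GCBF+; the toy dynamics and barrier are assumptions.

```python
import numpy as np

def cbf_safety_filter(u_nom, x, f, g, h, grad_h, alpha=1.0):
    """Minimally modify u_nom so that Lf h(x) + Lg h(x) u + alpha*h(x) >= 0."""
    a = grad_h(x) @ g(x)                    # Lg h(x), a row vector
    b = grad_h(x) @ f(x) + alpha * h(x)     # Lf h(x) + alpha*h(x), a scalar
    slack = a @ u_nom + b
    if slack >= 0.0:
        return u_nom                        # nominal control is already safe
    return u_nom - a * slack / (a @ a)      # closest control on the constraint boundary

# Toy single integrator keeping distance >= 1 from an obstacle at the origin.
f = lambda x: np.zeros(2)
g = lambda x: np.eye(2)
h = lambda x: x @ x - 1.0
grad_h = lambda x: 2.0 * x
print(cbf_safety_filter(np.array([-1.0, 0.0]), np.array([1.2, 0.0]), f, g, h, grad_h))
```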
+
+ comment: 20 pages, 15 figures; Accepted by IEEE Transactions on Robotics + (T-RO) +
+
+
+
+
+ + ♻ ☆ Automated Planning Domain Inference for Task and Motion Planning ICRA + + +
+ Task and motion planning (TAMP) frameworks address long and complex planning +problems by integrating high-level task planners with low-level motion +planners. However, existing TAMP methods rely heavily on the manual design of +planning domains that specify the preconditions and postconditions of all +high-level actions. This paper proposes a method to automate planning domain +inference from a handful of test-time trajectory demonstrations, reducing the +reliance on human design. Our approach incorporates a deep learning-based +estimator that predicts the appropriate components of a domain for a new task +and a search algorithm that refines this prediction, reducing the size and +ensuring the utility of the inferred domain. Our method is able to generate new +domains from minimal demonstrations at test time, enabling robots to handle +complex tasks more efficiently. We demonstrate that our approach outperforms +behavior cloning baselines, which directly imitate planner behavior, in terms +of planning performance and generalization across a variety of tasks. +Additionally, our method reduces computational costs and data amount +requirements at test time for inferring new planning domains. + +
+
+ comment: Accepted to 2025 International Conference on Robotics and + Automation(ICRA) 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Decentralized Structural-RNN for Robot Crowd Navigation with Deep + Reinforcement Learning ICRA + + +
+ Safe and efficient navigation through human crowds is an essential capability +for mobile robots. Previous work on robot crowd navigation assumes that the +dynamics of all agents are known and well-defined. In addition, the performance +of previous methods deteriorates in partially observable environments and +environments with dense crowds. To tackle these problems, we propose +decentralized structural-Recurrent Neural Network (DS-RNN), a novel network +that reasons about spatial and temporal relationships for robot decision making +in crowd navigation. We train our network with model-free deep reinforcement +learning without any expert supervision. We demonstrate that our model +outperforms previous methods in challenging crowd navigation scenarios. We +successfully transfer the policy learned in the simulator to a real-world +TurtleBot 2i. For more information, please visit the project website at +https://sites.google.com/view/crowdnav-ds-rnn/home. + +
+
+ comment: Published as a conference paper in IEEE International Conference on + Robotics and Automation (ICRA), 2021 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 106 + +
+
+
+ + ☆ RelightVid: Temporal-Consistent Diffusion Model for Video Relighting + + +
+ Diffusion models have demonstrated remarkable success in image generation and +editing, with recent advancements enabling albedo-preserving image relighting. +However, applying these models to video relighting remains challenging due to +the lack of paired video relighting datasets and the high demands for output +fidelity and temporal consistency, further complicated by the inherent +randomness of diffusion models. To address these challenges, we introduce +RelightVid, a flexible framework for video relighting that can accept +background video, text prompts, or environment maps as relighting conditions. +Trained on in-the-wild videos with carefully designed illumination +augmentations and rendered videos under extreme dynamic lighting, RelightVid +achieves arbitrary video relighting with high temporal consistency without +intrinsic decomposition while preserving the illumination priors of its image +backbone. + +
+
+
+
+
+ + ☆ Adaptive Iterative Compression for High-Resolution Files: an Approach + Focused on Preserving Visual Quality in Cinematic Workflows + + +
+ This study presents an iterative adaptive compression model for +high-resolution DPX-derived TIFF files used in cinematographic workflows and +digital preservation. The model employs SSIM and PSNR metrics to dynamically +adjust compression parameters across three configurations (C0, C1, C2), +achieving storage reductions up to 83.4 % while maintaining high visual +fidelity (SSIM > 0.95). Validation across three diverse productions - black and +white classic, soft-palette drama, and complex action film - demonstrated the +method's effectiveness in preserving critical visual elements while +significantly reducing storage requirements. Professional evaluators reported +90% acceptance rate for the optimal C1 configuration, with artifacts remaining +below perceptual threshold in critical areas. Comparative analysis with +JPEG2000 and H.265 showed superior quality preservation at equivalent +compression rates, particularly for high bit-depth content. While requiring +additional computational overhead, the method's storage benefits and quality +control capabilities make it suitable for professional workflows, with +potential applications in medical imaging and cloud storage optimization. + +
+
+
+
+
+ + ☆ LinPrim: Linear Primitives for Differentiable Volumetric Rendering + + +
+ Volumetric rendering has become central to modern novel view synthesis +methods, which use differentiable rendering to optimize 3D scene +representations directly from observed views. While many recent works build on +NeRF or 3D Gaussians, we explore an alternative volumetric scene +representation. More specifically, we introduce two new scene representations +based on linear primitives-octahedra and tetrahedra-both of which define +homogeneous volumes bounded by triangular faces. This formulation aligns +naturally with standard mesh-based tools, minimizing overhead for downstream +applications. To optimize these primitives, we present a differentiable +rasterizer that runs efficiently on GPUs, allowing end-to-end gradient-based +optimization while maintaining realtime rendering capabilities. Through +experiments on real-world datasets, we demonstrate comparable performance to +state-of-the-art volumetric methods while requiring fewer primitives to achieve +similar reconstruction fidelity. Our findings provide insights into the +geometry of volumetric rendering and suggest that adopting explicit polyhedra +can expand the design space of scene representations. + +
+
+
+
+
+ + ☆ Large Models in Dialogue for Active Perception and Anomaly Detection + + +
+ Autonomous aerial monitoring is an important task aimed at gathering information from areas that may not be easily accessible by humans. At the same time, this task often requires recognizing anomalies from a significant distance or anomalies not previously encountered. In this paper, we propose a novel framework that leverages the advanced capabilities provided by Large Language Models (LLMs) to actively collect information and perform anomaly detection in novel scenes. To this end, we propose an LLM-based model dialogue approach, in which two deep learning models engage in a dialogue to actively control a drone to increase perception and anomaly detection accuracy. We conduct our experiments in a high-fidelity simulation environment where an LLM is provided with a predetermined set of natural language movement commands mapped into executable code functions. Additionally, we deploy a multimodal Visual Question Answering (VQA) model charged with the task of visual question answering and captioning. By engaging the two models in conversation, the LLM asks exploratory questions while simultaneously flying the drone into different parts of the scene, providing a novel way to implement active perception. By leveraging the LLM's reasoning ability, we output an improved, detailed description of the scene, going beyond existing static perception approaches. In addition to information gathering, our approach is utilized for anomaly detection, and our results demonstrate the proposed method's effectiveness in informing and alerting about potential hazards.
+
+ comment: Accepted to International Conference of Pattern Recognition (ICPR + 2024) +
+
+
+
+
+ + ☆ FALCON: Resolving Visual Redundancy and Fragmentation in High-resolution + Multimodal Large Language Models via Visual Registers + + +
+ The incorporation of high-resolution visual input equips multimodal large +language models (MLLMs) with enhanced visual perception capabilities for +real-world tasks. However, most existing high-resolution MLLMs rely on a +cropping-based approach to process images, which leads to fragmented visual +encoding and a sharp increase in redundant tokens. To tackle these issues, we +propose the FALCON model. FALCON introduces a novel visual register technique +to simultaneously: 1) Eliminate redundant tokens at the stage of visual +encoding. To directly address the visual redundancy present in the output of +vision encoder, we propose a Register-based Representation Compacting +(ReCompact) mechanism. This mechanism introduces a set of learnable visual +registers designed to adaptively aggregate essential information while +discarding redundancy. It enables the encoder to produce a more compact visual +representation with a minimal number of output tokens, thus eliminating the +need for an additional compression module. 2) Ensure continuity in visual +encoding. To address the potential encoding errors caused by fragmented visual +inputs, we develop a Register Interactive Attention (ReAtten) module. This +module facilitates effective and efficient information exchange across +sub-images by enabling interactions between visual registers. It ensures the +continuity of visual semantics throughout the encoding. We conduct +comprehensive experiments with FALCON on high-resolution benchmarks across a +wide range of scenarios. FALCON demonstrates superior performance with a +remarkable 9-fold and 16-fold reduction in visual tokens. + +
+
+
+
+
+ + ☆ Mixture-of-Mamba: Enhancing Multi-Modal State-Space Models with + Modality-Aware Sparsity + + +
+ State Space Models (SSMs) have emerged as efficient alternatives to +Transformers for sequential modeling, but their inability to leverage +modality-specific features limits their performance in multi-modal pretraining. +Here, we propose Mixture-of-Mamba, a novel SSM architecture that introduces +modality-aware sparsity through modality-specific parameterization of the Mamba +block. Building on Mixture-of-Transformers (W. Liang et al. arXiv:2411.04996; +2024), we extend the benefits of modality-aware sparsity to SSMs while +preserving their computational efficiency. We evaluate Mixture-of-Mamba across +three multi-modal pretraining settings: Transfusion (interleaved text and +continuous image tokens with diffusion loss), Chameleon (interleaved text and +discrete image tokens), and an extended three-modality framework incorporating +speech. Mixture-of-Mamba consistently reaches the same loss values at earlier +training steps with significantly reduced computational costs. In the +Transfusion setting, Mixture-of-Mamba achieves equivalent image loss using only +34.76% of the training FLOPs at the 1.4B scale. In the Chameleon setting, +Mixture-of-Mamba reaches similar image loss with just 42.50% of the FLOPs at +the 1.4B scale, and similar text loss with just 65.40% of the FLOPs. In the +three-modality setting, MoM matches speech loss at 24.80% of the FLOPs at the +1.4B scale. Our ablation study highlights the synergistic effects of decoupling +projection components, where joint decoupling yields greater gains than +individual modifications. These results establish modality-aware sparsity as a +versatile and effective design principle, extending its impact from +Transformers to SSMs and setting new benchmarks in multi-modal pretraining. Our +code can be accessed at https://github.com/Weixin-Liang/Mixture-of-Mamba + +
+
+
+
+
+ + ☆ Multi-view Structural Convolution Network for Domain-Invariant Point + Cloud Recognition of Autonomous Vehicles + + +
+ Point cloud representation has recently become a research hotspot in the +field of computer vision and has been utilized for autonomous vehicles. +However, adapting deep learning networks for point cloud data recognition is +challenging due to the variability in datasets and sensor technologies. This +variability underscores the necessity for adaptive techniques to maintain +accuracy under different conditions. In this paper, we present the Multi-View +Structural Convolution Network (MSCN) designed for domain-invariant point cloud +recognition. MSCN comprises Structural Convolution Layers (SCL) that extract +local context geometric features from point clouds and Structural Aggregation +Layers (SAL) that extract and aggregate both local and overall context features +from point clouds. Additionally, our MSCN enhances feature representation +robustness by training with unseen domain point clouds derived from source +domain point clouds. This method acquires domain-invariant features and +exhibits robust, consistent performance across various point cloud datasets, +ensuring compatibility with diverse sensor configurations without the need for +parameter adjustments. This highlights MSCN's potential to significantly +improve the reliability and domain invariant features in different +environments. Our code is available at https://github.com/MLMLab/MSCN. + +
+
+ comment: 16 pages, 6 figures +
+
+
+
+
+ + Brain-Adapter: Enhancing Neurological Disorder Analysis with + Adapter-Tuning Multimodal Large Language Models + + +
+ Understanding brain disorders is crucial for accurate clinical diagnosis and +treatment. Recent advances in Multimodal Large Language Models (MLLMs) offer a +promising approach to interpreting medical images with the support of text +descriptions. However, previous research has primarily focused on 2D medical +images, leaving richer spatial information of 3D images under-explored, and +single-modality-based methods are limited by overlooking the critical clinical +information contained in other modalities. To address this issue, this paper +proposes Brain-Adapter, a novel approach that incorporates an extra bottleneck +layer to learn new knowledge and instill it into the original pre-trained +knowledge. The major idea is to incorporate a lightweight bottleneck layer to +train fewer parameters while capturing essential information and utilize a +Contrastive Language-Image Pre-training (CLIP) strategy to align multimodal +data within a unified representation space. Extensive experiments demonstrated +the effectiveness of our approach in integrating multimodal data to +significantly improve the diagnosis accuracy without high computational costs, +highlighting the potential to enhance real-world diagnostic workflows. + +
+
+
+
+
+ + ☆ Return of the Encoder: Maximizing Parameter Efficiency for SLMs + + +
+ The dominance of large decoder-only language models has overshadowed +encoder-decoder architectures, despite their fundamental efficiency advantages +in sequence processing. For small language models (SLMs) - those with 1 billion +parameters or fewer - our systematic analysis across GPU, CPU, and NPU +platforms reveals that encoder-decoder architectures achieve 47% lower +first-token latency and 4.7x higher throughput compared to decoder-only models +on edge devices. These gains may be attributed to encoder-decoder's one-time +input processing and efficient separation of understanding and generation +phases. + We introduce a novel knowledge distillation framework that enables +encoder-decoder models to leverage capabilities from large scalable +decoder-only teachers while preserving their architectural advantages, +achieving up to 6 average performance points improvement across diverse tasks, +with significant gains in asymmetric sequence tasks where input and output +distributions can benefit from different processing approaches. + When combined with modern advances like Rotary Positional Embeddings (RoPE) +and Vision encoders, our systematic investigation demonstrates that +encoder-decoder architectures provide a more practical path toward deploying +capable language models in resource-constrained environments. Our findings +challenge the prevailing trend toward decoder-only scaling, showing that +architectural choices become increasingly crucial as parameter budgets +decrease, particularly for on-device and edge deployments where computational +efficiency is paramount. + +
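The distillation framework is not spelled out in the abstract; the sketch below only illustrates the standard softened-logit distillation objective that such a setup could start from, with the decoder-only teacher's logits guiding the encoder-decoder student. Function and argument names are illustrative assumptions.

```python
# Generic sequence-level knowledge distillation: KL on temperature-softened logits plus
# cross-entropy on the labels. This is a common baseline objective, not the paper's exact
# cross-architecture framework.
import torch
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, labels,
                      T: float = 2.0, alpha: float = 0.5, pad_id: int = -100):
    # student_logits / teacher_logits: (batch, seq, vocab); labels: (batch, seq)
    kd = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)                                   # rescale gradients for the temperature
    ce = F.cross_entropy(
        student_logits.flatten(0, 1), labels.flatten(), ignore_index=pad_id
    )
    return alpha * kd + (1.0 - alpha) * ce
```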
+
+ comment: 13 pages, 5 figures. LLMs/SLMs, encoder-decoder and decoder-only +
+
+
+
+
+ + ☆ Lightweight Weighted Average Ensemble Model for Pneumonia Detection in + Chest X-Ray Images + + +
+ Pneumonia is a leading cause of illness and death in children, underscoring the need for early and accurate detection. In this study, we propose a novel lightweight ensemble model for detecting pneumonia in children using chest X-ray images. This ensemble model integrates two pre-trained convolutional neural networks (CNNs), MobileNetV2 and NASNetMobile, selected for their balance of computational efficiency and accuracy. These models were fine-tuned on a pediatric chest X-ray dataset and combined to enhance classification performance. Our proposed ensemble model achieved a classification accuracy of 98.63%, significantly outperforming individual models such as MobileNetV2 (97.10%) and NASNetMobile (96.25%) in terms of accuracy, precision, recall, and F1 score. Moreover, the ensemble model outperformed state-of-the-art architectures, including ResNet50, InceptionV3, and DenseNet201, while maintaining computational efficiency. The proposed lightweight ensemble model presents a highly effective and resource-efficient solution for pneumonia detection, making it particularly suitable for deployment in resource-constrained settings.
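A weighted-average ensemble of this kind reduces to averaging the two networks' class probabilities before taking the argmax. A minimal sketch follows; the weights shown are illustrative, not the values used in the paper.

```python
# Weighted-average ensembling of two classifiers' softmax outputs.
import numpy as np


def ensemble_predict(p_mobilenet: np.ndarray, p_nasnet: np.ndarray,
                     w_mobilenet: float = 0.5, w_nasnet: float = 0.5) -> np.ndarray:
    """p_*: (n_samples, n_classes) class probabilities from each fine-tuned CNN."""
    p = w_mobilenet * p_mobilenet + w_nasnet * p_nasnet  # blended class probabilities
    return p.argmax(axis=1)                              # predicted class per X-ray
```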
+
+ comment: Corresponding authors: Shanthi Karpurapu + (shanthi.karpurapu@gmail.com), Suresh Babu Nettur (nettursuresh@gmail.com) +
+
+
+
+
+ + ☆ CLISC: Bridging clip and sam by enhanced cam for unsupervised brain + tumor segmentation + + +
+ Brain tumor segmentation is important for diagnosis of the tumor, and current deep-learning methods rely on a large set of annotated images for training, with high annotation costs. Unsupervised segmentation is promising for avoiding human annotations, but its performance is often limited. In this study, we present a novel unsupervised segmentation approach that leverages the capabilities of foundation models, and it consists of three main steps: (1) A vision-language model (i.e., CLIP) is employed to obtain image-level pseudo-labels for training a classification network. Class Activation Mapping (CAM) is then employed to extract Regions of Interest (ROIs), where an adaptive masking-based data augmentation is used to enhance ROI identification. (2) The ROIs are used to generate bounding box and point prompts for the Segment Anything Model (SAM) to obtain segmentation pseudo-labels. (3) A 3D segmentation network is trained with the SAM-derived pseudo-labels, where low-quality pseudo-labels are filtered out in a self-learning process based on the similarity between the SAM's output and the network's prediction. Evaluation on the BraTS2020 dataset demonstrates that our approach obtained an average Dice Similarity Score (DSC) of 85.60%, outperforming five state-of-the-art unsupervised segmentation methods by more than 10 percentage points. Moreover, our approach outperforms directly using SAM for zero-shot inference, and its performance is close to fully supervised learning.
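The self-learning filter in step (3) keeps a SAM-derived pseudo-label only when it agrees with the segmentation network's own prediction. One natural way to implement that agreement check is a Dice-score threshold, sketched below with an illustrative threshold rather than the paper's setting.

```python
# Keep a pseudo-label only if the SAM mask and the network's prediction overlap enough (Dice).
import numpy as np


def dice(a: np.ndarray, b: np.ndarray, eps: float = 1e-6) -> float:
    a, b = a.astype(bool), b.astype(bool)
    return (2.0 * np.logical_and(a, b).sum() + eps) / (a.sum() + b.sum() + eps)


def keep_pseudo_label(sam_mask: np.ndarray, net_pred: np.ndarray, thresh: float = 0.7) -> bool:
    # thresh is an illustrative value, not the paper's.
    return dice(sam_mask, net_pred) >= thresh
```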
+
+ comment: 22nd IEEE International Symposium on Biomedical Imaging (ISBI 2025)
+
+
+
+
+ + ☆ Distilling foundation models for robust and efficient models in digital + pathology + + +
+ In recent years, the advent of foundation models (FM) for digital pathology has relied heavily on scaling the pre-training datasets and the model size, yielding large and powerful models. While this has improved performance on diverse downstream tasks, it has also increased computational cost and inference time. In this work, we explore the distillation of a large foundation model into a smaller one, reducing the number of parameters by several orders of magnitude. Leveraging distillation techniques, our distilled model, H0-mini, achieves nearly comparable performance to large FMs at a significantly reduced inference cost. It is evaluated on several public benchmarks, achieving 3rd place on the HEST benchmark and 5th place on the EVA benchmark. Additionally, a robustness analysis conducted on the PLISM dataset demonstrates that our distilled model achieves excellent robustness to variations in staining and scanning conditions, significantly outperforming other state-of-the-art models. This opens new perspectives to design lightweight and robust models for digital pathology, without compromising on performance.
+
+ comment: Preprint +
+
+
+
+
+ + ☆ PDC-ViT : Source Camera Identification using Pixel Difference + Convolution and Vision Transformer + + +
+ Source camera identification has emerged as a vital solution to unlock incidents involving critical cases like terrorism, violence, and other criminal activities. The ability to trace the origin of an image/video can aid law enforcement agencies in gathering evidence and constructing the timeline of events. Moreover, identifying the owner of a certain device narrows down the area of search in a criminal investigation where smartphone devices are involved. This paper proposes a new pixel-based method for source camera identification, integrating Pixel Difference Convolution (PDC) with a Vision Transformer network (ViT), named PDC-ViT. The PDC acts as the backbone for feature extraction by exploiting Angular PDC (APDC) and Radial PDC (RPDC); these techniques enhance the capability to capture subtle variations in pixel information, which are crucial for distinguishing between different source cameras. The second part of the methodology focuses on classification, which is based on a Vision Transformer network. Unlike traditional methods that utilize image patches directly for training the classification network, the proposed approach uniquely inputs PDC features into the Vision Transformer network. To demonstrate the effectiveness of the PDC-ViT approach, it has been assessed on five different datasets, which include various image contents and video scenes. The method has also been compared with state-of-the-art source camera identification methods. Experimental results demonstrate the effectiveness and superiority of the proposed system in terms of accuracy and robustness when compared to its competitors. For example, our proposed PDC-ViT has achieved an accuracy of 94.30%, 84%, 94.22% and 92.29% using the Vision dataset, Daxing dataset, Socrates dataset and QUFVD dataset, respectively.
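The Angular and Radial PDC variants are not specified in the abstract, but the underlying pixel-difference idea can be illustrated with the generic central-difference form, where the kernel operates on differences between each neighbour and the centre pixel. The PyTorch sketch below is only meant to convey that mechanism, not the paper's APDC/RPDC layers.

```python
# Central-difference style pixel-difference convolution: the kernel is applied to
# (neighbour - centre) differences, which emphasizes subtle local intensity variations.
import torch
import torch.nn as nn
import torch.nn.functional as F


class PixelDifferenceConv2d(nn.Module):
    def __init__(self, in_ch: int, out_ch: int, kernel_size: int = 3):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_ch, in_ch, kernel_size, kernel_size) * 0.02)
        self.padding = kernel_size // 2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        vanilla = F.conv2d(x, self.weight, padding=self.padding)
        # Subtract the centre-pixel response: the summed kernel weights applied to x itself.
        center = F.conv2d(x, self.weight.sum(dim=(2, 3), keepdim=True))
        return vanilla - center
```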
+
+
+
+
+ + ☆ SPECIAL: Zero-shot Hyperspectral Image Classification With CLIP + + +
+ Hyperspectral image (HSI) classification aims at categorizing each pixel in an HSI into a specific land cover class, which is crucial for applications like remote sensing, environmental monitoring, and agriculture. Although deep learning-based HSI classification methods have achieved significant advancements, existing methods still rely on manually labeled data for training, which is both time-consuming and labor-intensive. To address this limitation, we introduce a novel zero-shot hyperspectral image classification framework based on CLIP (SPECIAL), aiming to eliminate the need for manual annotations. The SPECIAL framework consists of two main stages: (1) CLIP-based pseudo-label generation, and (2) noisy label learning. In the first stage, HSI is spectrally interpolated to produce RGB bands. These bands are subsequently classified using CLIP, resulting in noisy pseudo-labels that are accompanied by confidence scores. To improve the quality of these labels, we propose a scaling strategy that fuses predictions from multiple spatial scales. In the second stage, spectral information and a label refinement technique are incorporated to mitigate label noise and further enhance classification accuracy. Experimental results on three benchmark datasets demonstrate that our SPECIAL outperforms existing methods in zero-shot HSI classification, showing its potential for more practical applications. The code is available at https://github.com/LiPang/SPECIAL.
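Stage (1) begins by spectrally interpolating the hyperspectral cube into RGB bands so that CLIP can score them. A simple stand-in for that step averages the bands closest to nominal red, green, and blue wavelengths; the wavelengths and neighbourhood size below are illustrative assumptions, not the paper's procedure.

```python
# Crude spectral interpolation of a hyperspectral cube to a pseudo-RGB image by averaging
# the k bands nearest to each target wavelength.
import numpy as np


def hsi_to_rgb(cube: np.ndarray, band_wavelengths: np.ndarray,
               targets=(650.0, 550.0, 450.0), k: int = 3) -> np.ndarray:
    """cube: (H, W, B) reflectance; band_wavelengths: (B,) in nm. Returns (H, W, 3) in [0, 1]."""
    channels = []
    for t in targets:                                      # R, G, B target wavelengths
        idx = np.argsort(np.abs(band_wavelengths - t))[:k] # k nearest bands
        channels.append(cube[..., idx].mean(axis=-1))
    rgb = np.stack(channels, axis=-1)
    return (rgb - rgb.min()) / (np.ptp(rgb) + 1e-8)        # normalize for display / CLIP input
```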
+
+
+
+
+ + ☆ Automatic Calibration of a Multi-Camera System with Limited Overlapping + Fields of View for 3D Surgical Scene Reconstruction + + +
+ Purpose: The purpose of this study is to develop an automated and accurate +external camera calibration method for multi-camera systems used in 3D surgical +scene reconstruction (3D-SSR), eliminating the need for operator intervention +or specialized expertise. The method specifically addresses the problem of +limited overlapping fields of view caused by significant variations in optical +zoom levels and camera locations. + Methods: We contribute a novel, fast, and fully automatic calibration method +based on the projection of multi-scale markers (MSMs) using a ceiling-mounted +projector. MSMs consist of 2D patterns projected at varying scales, ensuring +accurate extraction of well distributed point correspondences across +significantly different viewpoints and zoom levels. Validation is performed +using both synthetic and real data captured in a mock-up OR, with comparisons +to traditional manual marker-based methods as well as markerless calibration +methods. + Results: The method achieves accuracy comparable to manual, +operator-dependent calibration methods while exhibiting higher robustness under +conditions of significant differences in zoom levels. Additionally, we show +that state-of-the-art Structure-from-Motion (SfM) pipelines are ineffective in +3D-SSR settings, even when additional texture is projected onto the OR floor. + Conclusion: The use of a ceiling-mounted entry-level projector proves to be +an effective alternative to operator-dependent, traditional marker-based +methods, paving the way for fully automated 3D-SSR. + +
+
+
+
+
+ + ☆ UDBE: Unsupervised Diffusion-based Brightness Enhancement in Underwater + Images ICML + + +
+ Activities in underwater environments are paramount in several scenarios, +which drives the continuous development of underwater image enhancement +techniques. A major challenge in this domain is the depth at which images are +captured, with increasing depth resulting in a darker environment. Most +existing methods for underwater image enhancement focus on noise removal and +color adjustment, with few works dedicated to brightness enhancement. This work +introduces a novel unsupervised learning approach to underwater image +enhancement using a diffusion model. Our method, called UDBE, is based on +conditional diffusion to maintain the brightness details of the unpaired input +images. The input image is combined with a color map and a Signal-Noise +Relation map (SNR) to ensure stable training and prevent color distortion in +the output images. The results demonstrate that our approach achieves an +impressive accuracy rate in the datasets UIEB, SUIM and RUIE, well-established +underwater image benchmarks. Additionally, the experiments validate the +robustness of our approach, regarding the image quality metrics PSNR, SSIM, +UIQM, and UISM, indicating the good performance of the brightness enhancement +process. The source code is available here: https://github.com/gusanagy/UDBE. + +
+
+ comment: Paper presented at ICMLA 2024 +
+
+
+
+
+ + ☆ The Linear Attention Resurrection in Vision Transformer + + +
+ Vision Transformers (ViTs) have recently taken computer vision by storm. However, the softmax attention underlying ViTs comes with a quadratic complexity in time and memory, hindering the application of ViTs to high-resolution images. We revisit the attention design and propose a linear attention method to address the limitation, which, unlike existing methods (e.g., the local window attention of Swin), does not sacrifice ViT's core advantage of capturing global representations. We further investigate the key difference between linear attention and softmax attention. Our empirical results suggest that linear attention lacks a fundamental property of concentrating the distribution of the attention matrix. Inspired by this observation, we introduce a local concentration module to enhance linear attention. By incorporating enhanced linear global attention and local window attention, we propose a new ViT architecture, dubbed L$^2$ViT. Notably, L$^2$ViT can effectively capture both global interactions and local representations while enjoying linear computational complexity. Extensive experiments demonstrate the strong performance of L$^2$ViT. On image classification, L$^2$ViT achieves 84.4% Top-1 accuracy on ImageNet-1K without any extra training data or labels. By further pre-training on ImageNet-22k, it attains 87.0% when fine-tuned with resolution 384$^2$. For downstream tasks, L$^2$ViT delivers favorable performance as a backbone on object detection as well as semantic segmentation.
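Linear attention replaces $\mathrm{softmax}(QK^\top)V$ with $\phi(Q)(\phi(K)^\top V)$, so the cost grows linearly rather than quadratically with the number of tokens. The sketch below shows the generic mechanism with the common $\mathrm{elu}(x)+1$ feature map; it is not the L$^2$ViT block itself.

```python
# Generic linear attention: O(N) in sequence length because K^T V is aggregated once.
import torch
import torch.nn.functional as F


def linear_attention(q, k, v, eps: float = 1e-6):
    # q, k, v: (batch, heads, seq, dim)
    q = F.elu(q) + 1.0                                   # positive feature map phi(.)
    k = F.elu(k) + 1.0
    kv = torch.einsum("bhnd,bhne->bhde", k, v)           # sum_n phi(k_n) v_n^T
    z = 1.0 / (torch.einsum("bhnd,bhd->bhn", q, k.sum(dim=2)) + eps)  # normalizer
    return torch.einsum("bhnd,bhde,bhn->bhne", q, kv, z)
```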
+
+
+
+
+ + ☆ BAG: Body-Aligned 3D Wearable Asset Generation + + +
+ While recent advancements have shown remarkable progress in general 3D shape generation models, the challenge of leveraging these approaches to automatically generate wearable 3D assets remains unexplored. To this end, we present BAG, a Body-aligned Asset Generation method to output 3D wearable assets that can be automatically dressed on given 3D human bodies. This is achieved by controlling the 3D generation process using human body shape and pose information. Specifically, we first build a general single-image to consistent multiview image diffusion model, and train it on the large Objaverse dataset to achieve diversity and generalizability. Then we train a ControlNet to guide the multiview generator to produce body-aligned multiview images. The control signal utilizes the multiview 2D projections of the target human body, where pixel values represent the XYZ coordinates of the body surface in a canonical space. The body-conditioned multiview diffusion generates body-aligned multiview images, which are then fed into a native 3D diffusion model to produce the 3D shape of the asset. Finally, by recovering the similarity transformation using multiview silhouette supervision and addressing asset-body penetration with physics simulators, the 3D asset can be accurately fitted onto the target human body. Experimental results demonstrate significant advantages over existing methods in terms of image prompt-following capability, shape diversity, and shape quality. Our project page is available at https://bag-3d.github.io/.
+
+ comment: video: https://youtu.be/XJtG82LjQKc +
+
+
+
+
+ + ☆ Efficient Portrait Matte Creation With Layer Diffusion and Connectivity + Priors + + +
+ Learning effective deep portrait matting models requires training data of both high quality and large quantity. Neither quality nor quantity can be easily met for portrait matting, however. Since the most accurate ground-truth portrait mattes are acquired in front of the green screen, it is almost impossible to harvest a large-scale portrait matting dataset in reality. This work shows that one can leverage text prompts and the recent Layer Diffusion model to generate high-quality portrait foregrounds and extract latent portrait mattes. However, the portrait mattes cannot be readily used due to significant generation artifacts. Inspired by the connectivity prior observed in portrait images (the border of a portrait foreground always appears connected), a connectivity-aware approach is introduced to refine portrait mattes. Building on this, a large-scale portrait matting dataset is created, termed LD-Portrait-20K, with $20,051$ portrait foregrounds and high-quality alpha mattes. Extensive experiments demonstrated the value of the LD-Portrait-20K dataset, with models trained on it significantly outperforming those trained on other datasets. In addition, comparisons with the chroma keying algorithm and an ablation study on dataset capacity further confirmed the effectiveness of the proposed matte creation approach. Further, the dataset also contributes to state-of-the-art video portrait matting, implemented by simple video segmentation and a trimap-based image matting model trained on this dataset.
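The connectivity prior says a portrait foreground should form a single connected region, so floating artifacts can be suppressed by keeping only the largest connected component of the thresholded matte. The sketch below is a rough stand-in for the paper's refinement step; the threshold is an illustrative assumption.

```python
# Keep only the largest connected component of a (thresholded) alpha matte.
import numpy as np
from scipy import ndimage


def keep_largest_component(alpha: np.ndarray, thresh: float = 0.1) -> np.ndarray:
    """alpha: (H, W) matte in [0, 1]. Zeroes out alpha outside the largest connected region."""
    labels, n = ndimage.label(alpha > thresh)
    if n == 0:
        return alpha
    sizes = np.bincount(labels.ravel())[1:]          # component sizes, background excluded
    keep = labels == (int(np.argmax(sizes)) + 1)     # mask of the largest component
    return np.where(keep, alpha, 0.0)
```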
+
+
+
+
+ + ☆ Toward Efficient Generalization in 3D Human Pose Estimation via a + Canonical Domain Approach + + +
+ Recent advancements in deep learning methods have significantly improved the +performance of 3D Human Pose Estimation (HPE). However, performance degradation +caused by domain gaps between source and target domains remains a major +challenge to generalization, necessitating extensive data augmentation and/or +fine-tuning for each specific target domain. To address this issue more +efficiently, we propose a novel canonical domain approach that maps both the +source and target domains into a unified canonical domain, alleviating the need +for additional fine-tuning in the target domain. To construct the canonical +domain, we introduce a canonicalization process to generate a novel canonical +2D-3D pose mapping that ensures 2D-3D pose consistency and simplifies 2D-3D +pose patterns, enabling more efficient training of lifting networks. The +canonicalization of both domains is achieved through the following steps: (1) +in the source domain, the lifting network is trained within the canonical +domain; (2) in the target domain, input 2D poses are canonicalized prior to +inference by leveraging the properties of perspective projection and known +camera intrinsics. Consequently, the trained network can be directly applied to +the target domain without requiring additional fine-tuning. Experiments +conducted with various lifting networks and publicly available datasets (e.g., +Human3.6M, Fit3D, MPI-INF-3DHP) demonstrate that the proposed method +substantially improves generalization capability across datasets while using +the same data volume. + +
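The target-domain canonicalization relies on perspective projection and known intrinsics; at its simplest this means mapping pixel keypoints into normalized camera coordinates with $K^{-1}$ before lifting. The sketch below shows only that basic transform and is not the paper's full canonical 2D-3D mapping.

```python
# Map 2D keypoints from pixel coordinates to normalized (camera-independent) coordinates.
import numpy as np


def normalize_keypoints(kps_px: np.ndarray, K: np.ndarray) -> np.ndarray:
    """kps_px: (J, 2) pixel keypoints; K: (3, 3) camera intrinsics. Returns (J, 2)."""
    homo = np.concatenate([kps_px, np.ones((kps_px.shape[0], 1))], axis=1)  # (J, 3)
    norm = (np.linalg.inv(K) @ homo.T).T
    return norm[:, :2] / norm[:, 2:3]   # divide by the homogeneous coordinate
```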
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ☆ 3D Reconstruction of non-visible surfaces of objects from a Single Depth + View -- Comparative Study + + +
+ Scene and object reconstruction is an important problem in robotics, in particular in planning collision-free trajectories or in object manipulation. This paper compares two strategies for the reconstruction of non-visible parts of the object surface from a single RGB-D camera view. The first method, named DeepSDF, predicts the Signed Distance Transform to the object surface for a given point in 3D space. The second method, named MirrorNet, reconstructs the occluded objects' parts by generating images from the other side of the observed object. Experiments performed with objects from the ShapeNet dataset show that the view-dependent MirrorNet is faster and has smaller reconstruction errors in most categories.
+
+
+
+
+ + ☆ Automated Detection of Sport Highlights from Audio and Video Sources + + +
+ This study presents a novel Deep Learning-based and lightweight approach for +the automated detection of sports highlights (HLs) from audio and video +sources. HL detection is a key task in sports video analysis, traditionally +requiring significant human effort. Our solution leverages Deep Learning (DL) +models trained on relatively small datasets of audio Mel-spectrograms and +grayscale video frames, achieving promising accuracy rates of 89% and 83% for +audio and video detection, respectively. The use of small datasets, combined +with simple architectures, demonstrates the practicality of our method for fast +and cost-effective deployment. Furthermore, an ensemble model combining both +modalities shows improved robustness against false positives and false +negatives. The proposed methodology offers a scalable solution for automated HL +detection across various types of sports video content, reducing the need for +manual intervention. Future work will focus on enhancing model architectures +and extending this approach to broader scene-detection tasks in media analysis. + +
+
+
+
+
+ + ☆ ARFlow: Autogressive Flow with Hybrid Linear Attention + + +
+ Flow models are effective at progressively generating realistic images, but they generally struggle to capture long-range dependencies during the generation process, as they compress all the information from previous time steps into a single corrupted image. To address this limitation, we propose integrating autoregressive modeling -- known for its excellence in modeling complex, high-dimensional joint probability distributions -- into flow models. During training, at each step, we construct causally-ordered sequences by sampling multiple images from the same semantic category and applying different levels of noise, where images with higher noise levels serve as causal predecessors to those with lower noise levels. This design enables the model to learn broader category-level variations while maintaining proper causal relationships in the flow process. During generation, the model autoregressively conditions on the previously generated images from earlier denoising steps, forming a contextual and coherent generation trajectory. Additionally, we design a customized hybrid linear attention mechanism tailored to our modeling approach to enhance computational efficiency. Our approach, termed ARFlow, achieves an FID of 14.08 on ImageNet at 128 x 128 under 400k training steps without classifier-free guidance, and reaches 4.34 FID with classifier-free guidance scale 1.5, significantly outperforming the previous flow-based model SiT's 9.17 FID. Extensive ablation studies demonstrate the effectiveness of our modeling strategy and chunk-wise attention design.
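The causally-ordered training sequences are built by noising several images of one semantic category at decreasing levels, so that noisier images precede cleaner ones. A rough sketch of that construction follows; the noise schedule and the simple linear mixing are illustrative assumptions, not the paper's flow formulation.

```python
# Build a causally-ordered sequence: same-category images, noisier ones first.
import torch


def build_causal_sequence(images: torch.Tensor, noise_levels=(0.8, 0.6, 0.4, 0.2)):
    """images: (S, C, H, W) sampled from one semantic category, S == len(noise_levels)."""
    seq = []
    for img, sigma in zip(images, sorted(noise_levels, reverse=True)):
        # Higher sigma = more corrupted; these come earlier and act as causal predecessors.
        seq.append((1.0 - sigma) * img + sigma * torch.randn_like(img))
    return torch.stack(seq)
```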
+
+
+
+
+ + ☆ CILP-FGDI: Exploiting Vision-Language Model for Generalizable Person + Re-Identification + + +
+ The Visual Language Model, known for its robust cross-modal capabilities, has been extensively applied in various computer vision tasks. In this paper, we explore the use of CLIP (Contrastive Language-Image Pretraining), a vision-language model pretrained on large-scale image-text pairs to align visual and textual features, for acquiring fine-grained and domain-invariant representations in generalizable person re-identification. The adaptation of CLIP to the task presents two primary challenges: learning more fine-grained features to enhance discriminative ability, and learning more domain-invariant features to improve the model's generalization capabilities. To mitigate the first challenge and thereby enhance the ability to learn fine-grained features, a three-stage strategy is proposed to boost the accuracy of text descriptions. Initially, the image encoder is trained to effectively adapt to person re-identification tasks. In the second stage, the features extracted by the image encoder are used to generate textual descriptions (i.e., prompts) for each image. Finally, the text encoder with the learned prompts is employed to guide the training of the final image encoder. To enhance the model's generalization capabilities to unseen domains, a bidirectional guiding method is introduced to learn domain-invariant image features. Specifically, domain-invariant and domain-relevant prompts are generated, and both positive (pulling together image features and domain-invariant prompts) and negative (pushing apart image features and domain-relevant prompts) views are used to train the image encoder. Collectively, these strategies contribute to the development of an innovative CLIP-based framework for learning fine-grained generalized features in person re-identification.
+
+ comment: Accepted by IEEE TIFS +
+
+
+
+
+ + ☆ Addressing Out-of-Label Hazard Detection in Dashcam Videos: Insights + from the COOOL Challenge + + +
+ This paper presents a novel approach for hazard analysis in dashcam footage, +addressing the detection of driver reactions to hazards, the identification of +hazardous objects, and the generation of descriptive captions. We first +introduce a method for detecting driver reactions through speed and sound +anomaly detection, leveraging unsupervised learning techniques. For hazard +detection, we employ a set of heuristic rules as weak classifiers, which are +combined using an ensemble method. This ensemble approach is further refined +with differential privacy to mitigate overconfidence, ensuring robustness +despite the lack of labeled data. Lastly, we use state-of-the-art +vision-language models for hazard captioning, generating descriptive labels for +the detected hazards. Our method achieved the highest scores in the Challenge +on Out-of-Label in Autonomous Driving, demonstrating its effectiveness across +all three tasks. Source codes are publicly available at +https://github.com/ffyyytt/COOOL_2025. + +
+
+ comment: 5 pages, WACV 2025 +
+
+
+
+
+ + ☆ Freestyle Sketch-in-the-Loop Image Segmentation + + +
+ In this paper, we expand the domain of sketch research into the field of +image segmentation, aiming to establish freehand sketches as a query modality +for subjective image segmentation. Our innovative approach introduces a +"sketch-in-the-loop" image segmentation framework, enabling the segmentation of +visual concepts partially, completely, or in groupings - a truly "freestyle" +approach - without the need for a purpose-made dataset (i.e., mask-free). This +framework capitalises on the synergy between sketch-based image retrieval +(SBIR) models and large-scale pre-trained models (CLIP or DINOv2). The former +provides an effective training signal, while fine-tuned versions of the latter +execute the subjective segmentation. Additionally, our purpose-made +augmentation strategy enhances the versatility of our sketch-guided mask +generation, allowing segmentation at multiple granularity levels. Extensive +evaluations across diverse benchmark datasets underscore the superior +performance of our method in comparison to existing approaches across various +evaluation scenarios. + +
+
+
+
+
+ + ☆ Improving Tropical Cyclone Forecasting With Video Diffusion Models + + +
+ Tropical cyclone (TC) forecasting is crucial for disaster preparedness and +mitigation. While recent deep learning approaches have shown promise, existing +methods often treat TC evolution as a series of independent frame-to-frame +predictions, limiting their ability to capture long-term dynamics. We present a +novel application of video diffusion models for TC forecasting that explicitly +models temporal dependencies through additional temporal layers. Our approach +enables the model to generate multiple frames simultaneously, better capturing +cyclone evolution patterns. We introduce a two-stage training strategy that +significantly improves individual-frame quality and performance in low-data +regimes. Experimental results show our method outperforms the previous approach +of Nath et al. by 19.3% in MAE, 16.2% in PSNR, and 36.1% in SSIM. Most notably, +we extend the reliable forecasting horizon from 36 to 50 hours. Through +comprehensive evaluation using both traditional metrics and Fr\'echet Video +Distance (FVD), we demonstrate that our approach produces more temporally +coherent forecasts while maintaining competitive single-frame quality. Code +accessible at https://github.com/Ren-creater/forecast-video-diffmodels. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Controllable Forgetting Mechanism for Few-Shot Class-Incremental + Learning + + +
+ Class-incremental learning in the context of limited personal labeled samples +(few-shot) is critical for numerous real-world applications, such as smart home +devices. A key challenge in these scenarios is balancing the trade-off between +adapting to new, personalized classes and maintaining the performance of the +model on the original, base classes. Fine-tuning the model on novel classes +often leads to the phenomenon of catastrophic forgetting, where the accuracy of +base classes declines unpredictably and significantly. In this paper, we +propose a simple yet effective mechanism to address this challenge by +controlling the trade-off between novel and base class accuracy. We +specifically target the ultra-low-shot scenario, where only a single example is +available per novel class. Our approach introduces a Novel Class Detection +(NCD) rule, which adjusts the degree of forgetting a priori while +simultaneously enhancing performance on novel classes. We demonstrate the +versatility of our solution by applying it to state-of-the-art Few-Shot +Class-Incremental Learning (FSCIL) methods, showing consistent improvements +across different settings. To better quantify the trade-off between novel and +base class performance, we introduce new metrics: NCR@2FOR and NCR@5FOR. Our +approach achieves up to a 30% improvement in novel class accuracy on the +CIFAR100 dataset (1-shot, 1 novel class) while maintaining a controlled base +class forgetting rate of 2%. + +
+
+ comment: ICASSP 2025 +
+
+
+
+
+ + ☆ Real-Time Brain Tumor Detection in Intraoperative Ultrasound Using + YOLO11: From Model Training to Deployment in the Operating Room + + +
+ Intraoperative ultrasound (ioUS) is a valuable tool in brain tumor surgery +due to its versatility, affordability, and seamless integration into the +surgical workflow. However, its adoption remains limited, primarily because of +the challenges associated with image interpretation and the steep learning +curve required for effective use. This study aimed to enhance the +interpretability of ioUS images by developing a real-time brain tumor detection +system deployable in the operating room. We collected 2D ioUS images from the +Brain Tumor Intraoperative Database (BraTioUS) and the public ReMIND dataset, +annotated with expert-refined tumor labels. Using the YOLO11 architecture and +its variants, we trained object detection models to identify brain tumors. The +dataset included 1,732 images from 192 patients, divided into training, +validation, and test sets. Data augmentation expanded the training set to +11,570 images. In the test dataset, YOLO11s achieved the best balance of +precision and computational efficiency, with a mAP@50 of 0.95, mAP@50-95 of +0.65, and a processing speed of 34.16 frames per second. The proposed solution +was prospectively validated in a cohort of 15 consecutively operated patients +diagnosed with brain tumors. Neurosurgeons confirmed its seamless integration +into the surgical workflow, with real-time predictions accurately delineating +tumor regions. These findings highlight the potential of real-time object +detection algorithms to enhance ioUS-guided brain tumor surgery, addressing key +challenges in interpretation and providing a foundation for future development +of computer vision-based tools for neuro-oncological surgery. + +
+
+
+
+
+ + ☆ MatCLIP: Light- and Shape-Insensitive Assignment of PBR Material Models + + +
+ Assigning realistic materials to 3D models remains a significant challenge in +computer graphics. We propose MatCLIP, a novel method that extracts shape- and +lighting-insensitive descriptors of Physically Based Rendering (PBR) materials +to assign plausible textures to 3D objects based on images, such as the output +of Latent Diffusion Models (LDMs) or photographs. Matching PBR materials to +static images is challenging because the PBR representation captures the +dynamic appearance of materials under varying viewing angles, shapes, and +lighting conditions. By extending an Alpha-CLIP-based model on material +renderings across diverse shapes and lighting, and encoding multiple viewing +conditions for PBR materials, our approach generates descriptors that bridge +the domains of PBR representations with photographs or renderings, including +LDM outputs. This enables consistent material assignments without requiring +explicit knowledge of material relationships between different parts of an +object. MatCLIP achieves a top-1 classification accuracy of 76.6%, +outperforming state-of-the-art methods such as PhotoShape and MatAtlas by over +15 percentage points on publicly available datasets. Our method can be used to +construct material assignments for 3D shape datasets such as ShapeNet, +3DCoMPaT++, and Objaverse. All code and data will be released. + +
+
+ comment: Preprint, 10 pages +
+
+
+
+
+ + ☆ Evaluating Data Influence in Meta Learning + + +
+ As one of the most fundamental learning paradigms, meta learning aims to effectively address few-shot learning challenges. However, it still faces significant issues related to the training data, such as training inefficiencies due to numerous low-contribution tasks in large datasets and substantial noise from incorrect labels. Thus, training data attribution methods are needed for meta learning. However, the dual-layer structure of meta learning complicates the modeling of training data contributions because of the interdependent influence between meta-parameters and task-specific parameters, making existing data influence evaluation tools inapplicable or inaccurate. To address these challenges, based on the influence function, we propose a general data attribution evaluation framework for meta-learning within the bilevel optimization framework. Our approach introduces task influence functions (task-IF) and instance influence functions (instance-IF) to accurately assess the impact of specific tasks and individual data points in closed forms. This framework comprehensively models data contributions across both the inner and outer training processes, capturing the direct effects of data points on meta-parameters as well as their indirect influence through task-specific parameters. We also provide several strategies to enhance computational efficiency and scalability. Experimental results demonstrate the framework's effectiveness in training data evaluation via several downstream tasks.
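For reference, the task-IF and instance-IF extend the classical influence function; in its standard single-level form, the influence of up-weighting a training point $z$ on the loss at a test point $z_{\text{test}}$ is

$$\mathcal{I}(z, z_{\text{test}}) = -\,\nabla_\theta L(z_{\text{test}}, \hat{\theta})^{\top} H_{\hat{\theta}}^{-1}\, \nabla_\theta L(z, \hat{\theta}), \qquad H_{\hat{\theta}} = \frac{1}{n}\sum_{i=1}^{n} \nabla_\theta^{2} L(z_i, \hat{\theta}),$$

where $\hat{\theta}$ is the empirical risk minimizer. The paper's contribution is the bilevel extension of this quantity to meta-parameters and task-specific parameters, which the classical formula above does not capture.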
+
+
+
+
+ + ☆ Rethinking the Bias of Foundation Model under Long-tailed Distribution + + +
+ Long-tailed learning has garnered increasing attention due to its practical significance. Among the various approaches, the fine-tuning paradigm has gained considerable interest with the advent of foundation models. However, most existing methods primarily focus on leveraging knowledge from these models, overlooking the inherent biases introduced by the imbalanced training data they rely on. In this paper, we examine how such imbalances from pre-training affect long-tailed downstream tasks. Specifically, we find that the imbalance biases inherited from foundation models manifest in downstream tasks as parameter imbalance and data imbalance. During fine-tuning, we observe that parameter imbalance plays a more critical role, while data imbalance can be mitigated using existing re-balancing strategies. Moreover, we find that, unlike data imbalance, parameter imbalance cannot be effectively addressed during training by current re-balancing techniques, such as logit adjustment. To tackle both imbalances simultaneously, we build our method on causal learning and view the incomplete semantic factor as the confounder, which brings spurious correlations between input samples and labels. To resolve the negative effects of this, we propose a novel backdoor adjustment method that learns the true causal effect between input samples and labels, rather than merely fitting the correlations in the data. Notably, we achieve an average performance increase of about $1.67\%$ on each dataset.
+
+
+
+
+ + ☆ Understanding Long Videos via LLM-Powered Entity Relation Graphs + + +
+ The analysis of extended video content poses unique challenges in artificial +intelligence, particularly when dealing with the complexity of tracking and +understanding visual elements across time. Current methodologies that process +video frames sequentially struggle to maintain coherent tracking of objects, +especially when these objects temporarily vanish and later reappear in the +footage. A critical limitation of these approaches is their inability to +effectively identify crucial moments in the video, largely due to their limited +grasp of temporal relationships. To overcome these obstacles, we present +GraphVideoAgent, a cutting-edge system that leverages the power of graph-based +object tracking in conjunction with large language model capabilities. At its +core, our framework employs a dynamic graph structure that maps and monitors +the evolving relationships between visual entities throughout the video +sequence. This innovative approach enables more nuanced understanding of how +objects interact and transform over time, facilitating improved frame selection +through comprehensive contextual awareness. Our approach demonstrates +remarkable effectiveness when tested against industry benchmarks. In +evaluations on the EgoSchema dataset, GraphVideoAgent achieved a 2.2 +improvement over existing methods while requiring analysis of only 8.2 frames +on average. Similarly, testing on the NExT-QA benchmark yielded a 2.0 +performance increase with an average frame requirement of 8.1. These results +underscore the efficiency of our graph-guided methodology in enhancing both +accuracy and computational performance in long-form video understanding tasks. + +
+
+
+
+
+ + Any2AnyTryon: Leveraging Adaptive Position Embeddings for Versatile + Virtual Clothing Tasks + + +
+ Image-based virtual try-on (VTON) aims to generate a virtual try-on result by transferring an input garment onto a target person's image. However, the scarcity of paired garment-model data makes it challenging for existing methods to achieve high generalization and quality in VTON. It also limits the ability to generate mask-free try-ons. To tackle the data scarcity problem, approaches such as Stable Garment and MMTryon use a synthetic data strategy, effectively increasing the amount of paired data on the model side. However, existing methods are typically limited to performing specific try-on tasks and lack user-friendliness. To enhance the generalization and controllability of VTON generation, we propose Any2AnyTryon, which can generate try-on results based on different textual instructions and model garment images to meet various needs, eliminating the reliance on masks, poses, or other conditions. Specifically, we first construct the virtual try-on dataset LAION-Garment, the largest known open-source garment try-on dataset. Then, we introduce adaptive position embedding, which enables the model to generate satisfactory outfitted model images or garment images based on input images of different sizes and categories, significantly enhancing the generalization and controllability of VTON generation. In our experiments, we demonstrate the effectiveness of our Any2AnyTryon and compare it with existing methods. The results show that Any2AnyTryon enables flexible, controllable, and high-quality image-based virtual try-on generation. https://logn-2024.github.io/Any2anyTryonProjectPage/
+
+ comment: 13 pages,13 figures +
+
+
+
+
+ + ☆ A Data-Centric Approach: Dimensions of Visual Complexity and How to find + Them + + +
+ Understanding how humans perceive visual complexity is a key area of study in +visual cognition. Previous approaches to modeling visual complexity have often +resulted in intricate, difficult-to-interpret solutions that employ numerous +features or sophisticated deep learning architectures. While these complex +models achieve high performance on specific datasets, they often sacrifice +interpretability, making it challenging to understand the factors driving human +perception of complexity. A recent model based on image segmentations showed +promise in addressing this challenge; however, it presented limitations in +capturing structural and semantic aspects of visual complexity. In this paper, +we propose viable and effective features to overcome these shortcomings. +Specifically, we develop multiscale features for the structural aspect of +complexity, including the Multiscale Sobel Gradient (MSG), which captures +spatial intensity variations across scales, and Multiscale Unique Colors (MUC), +which quantifies image colorfulness by indexing quantized RGB values. We also +introduce a new dataset SVG based on Visual Genome to explore the semantic +aspect of visual complexity, obtaining surprise scores based on the element of +surprise in images, which we demonstrate significantly contributes to perceived +complexity. Overall, we suggest that the nature of the data is fundamental to +understanding and modeling visual complexity, highlighting the importance of +both structural and semantic dimensions in providing a comprehensive, +interpretable assessment. The code for our analysis, experimental setup, and +dataset will be made publicly available upon acceptance. + +
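The two structural features are easy to state concretely: Multiscale Sobel Gradient (MSG) averages Sobel gradient magnitude over several image scales, and Multiscale Unique Colors (MUC) counts distinct quantized RGB values per scale. The sketch below uses simple strided downsampling, three scales, and 5-bit quantization as illustrative choices; the paper's exact settings may differ.

```python
# Illustrative versions of the MSG and MUC features described above.
import numpy as np
from scipy import ndimage


def msg(gray: np.ndarray, scales=(1, 2, 4)) -> float:
    """gray: (H, W) grayscale image. Mean Sobel gradient magnitude across scales."""
    vals = []
    for s in scales:
        im = gray[::s, ::s].astype(float)                 # crude multiscale via striding
        gx, gy = ndimage.sobel(im, axis=1), ndimage.sobel(im, axis=0)
        vals.append(np.mean(np.hypot(gx, gy)))
    return float(np.mean(vals))


def muc(rgb: np.ndarray, scales=(1, 2, 4), bits: int = 5) -> float:
    """rgb: (H, W, 3) uint8 image. Mean count of distinct quantized colours across scales."""
    counts = []
    for s in scales:
        q = (rgb[::s, ::s] >> (8 - bits)).reshape(-1, 3)  # quantize each channel to `bits` bits
        counts.append(len(np.unique(q, axis=0)))
    return float(np.mean(counts))
```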
+
+
+
+
+ + ☆ Slot-Guided Adaptation of Pre-trained Diffusion Models for + Object-Centric Learning and Compositional Generation ICLR2025 + + +
+ We present SlotAdapt, an object-centric learning method that combines slot +attention with pretrained diffusion models by introducing adapters for +slot-based conditioning. Our method preserves the generative power of +pretrained diffusion models, while avoiding their text-centric conditioning +bias. We also incorporate an additional guidance loss into our architecture to +align cross-attention from adapter layers with slot attention. This enhances +the alignment of our model with the objects in the input image without using +external supervision. Experimental results show that our method outperforms +state-of-the-art techniques in object discovery and image generation tasks +across multiple datasets, including those with real images. Furthermore, we +demonstrate through experiments that our method performs remarkably well on +complex real-world images for compositional generation, in contrast to other +slot-based generative methods in the literature. The project page can be found +at $\href{https://kaanakan.github.io/SlotAdapt/}{\text{this https url}}$. + +
+
+ comment: Accepted to ICLR2025. + $\href{https://kaanakan.github.io/SlotAdapt/}{\text{Project Page}}$ +
+
+
+
+
+ + ☆ D-PLS: Decoupled Semantic Segmentation for + 4D-Panoptic-LiDAR-Segmentation + + +
+ This paper introduces a novel approach to 4D Panoptic LiDAR Segmentation that +decouples semantic and instance segmentation, leveraging single-scan semantic +predictions as prior information for instance segmentation. Our method D-PLS +first performs single-scan semantic segmentation and aggregates the results +over time, using them to guide instance segmentation. The modular design of +D-PLS allows for seamless integration on top of any semantic segmentation +architecture, without requiring architectural changes or retraining. We +evaluate our approach on the SemanticKITTI dataset, where it demonstrates +significant improvements over the baseline in both classification and +association tasks, as measured by the LiDAR Segmentation and Tracking Quality +(LSTQ) metric. Furthermore, we show that our decoupled architecture not only +enhances instance prediction but also surpasses the baseline due to +advancements in single-scan semantic segmentation. + +
+
+
+
+
+ + ☆ The Components of Collaborative Joint Perception and Prediction -- A + Conceptual Framework + + +
+ Connected Autonomous Vehicles (CAVs) benefit from Vehicle-to-Everything (V2X) +communication, which enables the exchange of sensor data to achieve +Collaborative Perception (CP). To reduce cumulative errors in perception +modules and mitigate the visual occlusion, this paper introduces a new task, +Collaborative Joint Perception and Prediction (Co-P&P), and provides a +conceptual framework for its implementation to improve motion prediction of +surrounding objects, thereby enhancing vehicle awareness in complex traffic +scenarios. The framework consists of two decoupled core modules, Collaborative +Scene Completion (CSC) and Joint Perception and Prediction (P&P) module, which +simplify practical deployment and enhance scalability. Additionally, we outline +the challenges in Co-P&P and discuss future directions for this research area. + +
+
+ comment: 8 pages, 4 figures, accepted by conference VEHITS2025 +
+
+
+
+
+ + ☆ CausalSR: Structural Causal Model-Driven Super-Resolution with + Counterfactual Inference + + +
+ Physical and optical factors interacting with sensor characteristics create +complex image degradation patterns. Despite advances in deep learning-based +super-resolution, existing methods overlook the causal nature of degradation by +adopting simplistic black-box mappings. This paper formulates super-resolution +using structural causal models to reason about image degradation processes. We +establish a mathematical foundation that unifies principles from causal +inference, deriving necessary conditions for identifying latent degradation +mechanisms and corresponding propagation. We propose a novel counterfactual +learning strategy that leverages semantic guidance to reason about hypothetical +degradation scenarios, leading to theoretically-grounded representations that +capture invariant features across different degradation conditions. The +framework incorporates an adaptive intervention mechanism with provable bounds +on treatment effects, allowing precise manipulation of degradation factors +while maintaining semantic consistency. Through extensive empirical validation, +we demonstrate that our approach achieves significant improvements over +state-of-the-art methods, particularly in challenging scenarios with compound +degradations. On standard benchmarks, our method consistently outperforms +existing approaches by significant margins (0.86-1.21dB PSNR), while providing +interpretable insights into the restoration process. The theoretical framework +and empirical results demonstrate the fundamental importance of causal +reasoning in understanding image restoration systems. + +
+
+
+
+
+ + ☆ Can Location Embeddings Enhance Super-Resolution of Satellite Imagery? + + +
+ Publicly available satellite imagery, such as Sentinel-2, often lacks the spatial resolution required for accurate analysis of remote sensing tasks including urban planning and disaster response. Current super-resolution techniques are typically trained on limited datasets, leading to poor generalization across diverse geographic regions. In this work, we propose a novel super-resolution framework that enhances generalization by incorporating geographic context through location embeddings. Our framework employs Generative Adversarial Networks (GANs) and incorporates techniques from diffusion models to enhance image quality. Furthermore, we address tiling artifacts by integrating information from neighboring images, enabling the generation of seamless, high-resolution outputs. We demonstrate the effectiveness of our method on the building segmentation task, showing significant improvements over state-of-the-art methods and highlighting its potential for real-world applications.
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) +
+
+
+
+
+ + ☆ Controllable Hand Grasp Generation for HOI and Efficient Evaluation + Methods + + +
+ Controllable affordance Hand-Object Interaction (HOI) generation has become an increasingly important area of research in computer vision. In HOI generation, hand grasp generation is a crucial step for effectively controlling the geometry of the hand. Current hand grasp generation methods rely on 3D information for both the hand and the object. In addition, these methods lack controllability concerning the hand's location and orientation. We treat the hand pose as a discrete graph structure and exploit its geometric priors. It is well established that higher-order contextual dependency among the points generally improves the quality of the results. We propose a framework of higher-order geometric representations (HORs) inspired by spectral graph theory and vector algebra to improve the quality of generated hand poses. We demonstrate the effectiveness of our proposed HORs in devising a controllable novel diffusion method (based on 2D information) for hand grasp generation that outperforms the state of the art (SOTA), overcoming the limitations of existing methods, namely the lack of controllability and the dependency on 3D information. Once poses are generated, it is natural to evaluate them using a metric. Popular metrics like FID and MMD are biased and inefficient for evaluating generated hand poses. Using our proposed HORs, we introduce an efficient and stable framework of evaluation metrics for grasp generation methods, addressing inefficiencies and biases in FID and MMD.
+
+
+
+
+ + ☆ Pfungst and Clever Hans: Identifying the unintended cues in a widely + used Alzheimer's disease MRI dataset using explainable deep learning + + +
+ Background. Deep neural networks have demonstrated high accuracy in classifying Alzheimer's disease (AD). This study aims to illuminate their underlying black-box nature and reveal the individual contributions of T1-weighted (T1w) gray-white matter texture, volumetric information, and preprocessing to classification performance. Methods. We utilized T1w MRI data from the Alzheimer's Disease Neuroimaging Initiative to distinguish matched AD patients (990 MRIs) from healthy controls (990 MRIs). Preprocessing included skull stripping and binarization at varying thresholds to systematically eliminate texture information. A deep neural network was trained on these configurations, and the model performance was compared using McNemar tests with discrete Bonferroni-Holm correction. Layer-wise Relevance Propagation (LRP) and structural similarity metrics between heatmaps were applied to analyze learned features. Results. Classification performance metrics (accuracy, sensitivity, and specificity) were comparable across all configurations, indicating a negligible influence of T1w gray- and white-matter signal texture. Models trained on binarized images demonstrated similar feature performance and relevance distributions, with volumetric features such as atrophy and skull-stripping features emerging as primary contributors. Conclusions. We revealed a previously undiscovered Clever Hans effect in a widely used AD MRI dataset. Deep neural network classification predominantly relies on volumetric features, while eliminating gray-white matter T1w texture did not decrease performance. This study clearly demonstrates an overestimation of the importance of gray-white matter contrasts, at least for widely used structural T1w images, and highlights potential misinterpretation of performance metrics.
+
+
+
+
+ + ☆ ClearSight: Human Vision-Inspired Solutions for Event-Based Motion + Deblurring + + +
+ Motion deblurring addresses the challenge of image blur caused by camera or scene movement. Event cameras provide motion information that is encoded in the asynchronous event streams. To efficiently leverage the temporal information of event streams, we employ Spiking Neural Networks (SNNs) for motion feature extraction and Artificial Neural Networks (ANNs) for color information processing. Due to the non-uniform distribution and inherent redundancy of event data, existing cross-modal feature fusion methods exhibit certain limitations. Inspired by the visual attention mechanism in the human visual system, this study introduces a bioinspired dual-drive hybrid network (BDHNet). Specifically, the Neuron Configurator Module (NCM) is designed to dynamically adjust neuron configurations based on cross-modal features, thereby focusing the spikes in blurry regions and adapting to varying blur scenarios. Additionally, the Region of Blurry Attention Module (RBAM) is introduced to generate a blurry mask in an unsupervised manner, effectively extracting motion clues from the event features and guiding more accurate cross-modal feature fusion. Extensive subjective and objective evaluations demonstrate that our method outperforms current state-of-the-art methods on both synthetic and real-world datasets.
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ MM-Retinal V2: Transfer an Elite Knowledge Spark into Fundus + Vision-Language Pretraining + + +
+ Vision-language pretraining (VLP) has been investigated to generalize across +diverse downstream tasks for fundus image analysis. Although recent methods +showcase promising achievements, they significantly rely on large-scale private +image-text data but pay less attention to the pretraining manner, which limits +their further advancements. In this work, we introduce MM-Retinal V2, a +high-quality image-text paired dataset comprising CFP, FFA, and OCT image +modalities. Then, we propose a novel fundus vision-language pretraining model, +namely KeepFIT V2, which is pretrained by integrating knowledge from the elite +data spark into categorical public datasets. Specifically, a preliminary +textual pretraining is adopted to equip the text encoder with primarily +ophthalmic textual knowledge. Moreover, a hybrid image-text knowledge injection +module is designed for knowledge transfer, which is essentially based on a +combination of global semantic concepts from contrastive learning and local +appearance details from generative learning. Extensive experiments across +zero-shot, few-shot, and linear probing settings highlight the generalization +and transferability of KeepFIT V2, delivering performance competitive to +state-of-the-art fundus VLP models trained on large-scale private image-text +datasets. Our dataset and model are publicly available via +https://github.com/lxirich/MM-Retinal. + +
+
+
+
+
+ + ☆ Can Multimodal Large Language Models be Guided to Improve Industrial + Anomaly Detection? + + +
+ In industrial settings, the accurate detection of anomalies is essential for +maintaining product quality and ensuring operational safety. Traditional +industrial anomaly detection (IAD) models often struggle with flexibility and +adaptability, especially in dynamic production environments where new defect +types and operational changes frequently arise. Recent advancements in +Multimodal Large Language Models (MLLMs) hold promise for overcoming these +limitations by combining visual and textual information processing +capabilities. MLLMs excel in general visual understanding due to their training +on large, diverse datasets, but they lack domain-specific knowledge, such as +industry-specific defect tolerance levels, which limits their effectiveness in +IAD tasks. To address these challenges, we propose Echo, a novel multi-expert +framework designed to enhance MLLM performance for IAD. Echo integrates four +expert modules: Reference Extractor which provides a contextual baseline by +retrieving similar normal images, Knowledge Guide which supplies +domain-specific insights, Reasoning Expert which enables structured, stepwise +reasoning for complex queries, and Decision Maker which synthesizes information +from all modules to deliver precise, context-aware responses. Evaluated on the +MMAD benchmark, Echo demonstrates significant improvements in adaptability, +precision, and robustness, moving closer to meeting the demands of real-world +industrial anomaly detection. + +
+
+ comment: 16 pages, 11 figures +
+
+
+
+
+ + ☆ Do Existing Testing Tools Really Uncover Gender Bias in Text-to-Image + Models? + + +
+ Text-to-Image (T2I) models have recently gained significant attention due to
+their ability to generate high-quality images and are consequently used in a
+wide range of applications. However, there are concerns about the gender bias
+of these models. Previous studies have shown that T2I models can perpetuate or
+even amplify gender stereotypes when provided with neutral text prompts.
+Researchers have proposed automated detectors for uncovering gender bias in T2I
+models, but a crucial gap exists: no existing work comprehensively compares the
+various detectors or examines how the gender bias they detect deviates from the
+actual situation. This study addresses this gap by validating previous gender
+bias detectors using a manually labeled dataset and comparing how the bias
+identified by various detectors deviates from the actual bias in T2I models, as
+verified by manual confirmation. We create a dataset consisting of 6,000 images
+generated from three cutting-edge T2I models: Stable Diffusion XL, Stable
+Diffusion 3, and Dreamlike Photoreal 2.0. During the human-labeling process, we
+find that all three T2I models generate a portion (12.48% on average) of
+low-quality images (e.g., images with no face present), where human annotators
+cannot determine the gender of the person. Our analysis reveals that all three
+T2I models show a preference for generating male images, with SDXL being the
+most biased. Additionally, images generated using prompts containing
+professional descriptions (e.g., lawyer or doctor) show the most bias. We
+evaluate seven gender bias detectors and find that none fully capture the
+actual level of bias in T2I models, with some detectors overestimating bias by
+up to 26.95%. We further investigate the causes of inaccurate estimations,
+highlighting the limitations of detectors in dealing with low-quality images.
+Based on our findings, we propose an enhanced detector...
+
+
+
+
+
+
+ + ☆ Efficient Attention-Sharing Information Distillation Transformer for + Lightweight Single Image Super-Resolution + + +
+ Transformer-based Super-Resolution (SR) methods have demonstrated superior +performance compared to convolutional neural network (CNN)-based SR approaches +due to their capability to capture long-range dependencies. However, their high +computational complexity necessitates the development of lightweight approaches +for practical use. To address this challenge, we propose the Attention-Sharing +Information Distillation (ASID) network, a lightweight SR network that +integrates attention-sharing and an information distillation structure +specifically designed for Transformer-based SR methods. We modify the +information distillation scheme, originally designed for efficient CNN +operations, to reduce the computational load of stacked self-attention layers, +effectively addressing the efficiency bottleneck. Additionally, we introduce +attention-sharing across blocks to further minimize the computational cost of +self-attention operations. By combining these strategies, ASID achieves +competitive performance with existing SR methods while requiring only around +300K parameters - significantly fewer than existing CNN-based and +Transformer-based SR models. Furthermore, ASID outperforms state-of-the-art SR +methods when the number of parameters is matched, demonstrating its efficiency +and effectiveness. The code and supplementary material are available on the +project page. + +
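+ The attention-sharing idea can be pictured with a minimal sketch: compute the
+self-attention map once in the first block and reuse it in later blocks, so
+only the value projections are recomputed. This is a generic illustration of
+the mechanism named in the abstract, not the authors' ASID implementation; the
+block structure, dimensions, and sharing schedule are assumptions.
+
+# Minimal attention-sharing sketch (illustrative, not the ASID code).
+import torch
+import torch.nn as nn
+
+class SharedAttentionBlock(nn.Module):
+    def __init__(self, dim: int, heads: int = 4):
+        super().__init__()
+        self.heads = heads
+        self.qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, x, shared_attn=None):
+        b, n, d = x.shape
+        h = self.heads
+        q, k, v = self.qkv(self.norm(x)).chunk(3, dim=-1)
+        v = v.reshape(b, n, h, d // h).transpose(1, 2)          # (b, h, n, d/h)
+        if shared_attn is None:                                  # first block: compute attention
+            q = q.reshape(b, n, h, d // h).transpose(1, 2)
+            k = k.reshape(b, n, h, d // h).transpose(1, 2)
+            attn = ((q @ k.transpose(-2, -1)) * (d // h) ** -0.5).softmax(dim=-1)
+        else:                                                    # later blocks: reuse the map
+            attn = shared_attn
+        out = (attn @ v).transpose(1, 2).reshape(b, n, d)
+        return x + self.proj(out), attn
+
+if __name__ == "__main__":
+    blocks = nn.ModuleList([SharedAttentionBlock(64) for _ in range(4)])
+    x, attn = torch.randn(2, 256, 64), None
+    for blk in blocks:
+        x, attn = blk(x, shared_attn=attn)   # attention computed once, shared afterwards
+    print(x.shape)
+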
+
+ comment: Published at AAAI 2025, for project page, see + https://github.com/saturnian77/ASID +
+
+
+
+
+ + ☆ NanoHTNet: Nano Human Topology Network for Efficient 3D Human Pose + Estimation + + +
+ The widespread application of 3D human pose estimation (HPE) is limited by +resource-constrained edge devices, requiring more efficient models. A key +approach to enhancing efficiency involves designing networks based on the +structural characteristics of input data. However, effectively utilizing the +structural priors in human skeletal inputs remains challenging. To address +this, we leverage both explicit and implicit spatio-temporal priors of the +human body through innovative model design and a pre-training proxy task. +First, we propose a Nano Human Topology Network (NanoHTNet), a tiny 3D HPE +network with stacked Hierarchical Mixers to capture explicit features. +Specifically, the spatial Hierarchical Mixer efficiently learns the human +physical topology across multiple semantic levels, while the temporal +Hierarchical Mixer with discrete cosine transform and low-pass filtering +captures local instantaneous movements and global action coherence. Moreover, +Efficient Temporal-Spatial Tokenization (ETST) is introduced to enhance +spatio-temporal interaction and reduce computational complexity significantly. +Second, PoseCLR is proposed as a general pre-training method based on +contrastive learning for 3D HPE, aimed at extracting implicit representations +of human topology. By aligning 2D poses from diverse viewpoints in the proxy +task, PoseCLR aids 3D HPE encoders like NanoHTNet in more effectively capturing +the high-dimensional features of the human body, leading to further performance +improvements. Extensive experiments verify that NanoHTNet with PoseCLR +outperforms other state-of-the-art methods in efficiency, making it ideal for +deployment on edge devices like the Jetson Nano. Code and models are available +at https://github.com/vefalun/NanoHTNet. + +
+
+
+
+
+ + ☆ Efficiency Bottlenecks of Convolutional Kolmogorov-Arnold Networks: A + Comprehensive Scrutiny with ImageNet, AlexNet, LeNet and Tabular + Classification + + +
+ Algorithm-level developments such as Convolutional Neural Networks,
+Transformers, attention mechanisms, and Retrieval Augmented Generation have
+changed Artificial Intelligence. A recent such development is the
+Kolmogorov-Arnold Network, which challenges the fundamental design of the
+neural network and thereby of Multilayer Perceptrons and Convolutional Neural
+Networks. Kolmogorov-Arnold Networks were well received for scientific
+modeling, yet showed drawbacks in efficiency. In this paper, we train
+Convolutional Kolmogorov Arnold Networks (CKANs) with the ImageNet-1k dataset
+with 1.3 million images, the MNIST dataset with 60k images, and a tabular
+biological science related MoA dataset, and test the promise of CKANs in terms
+of FLOPS, inference time, number of trainable parameters, and training time,
+as well as the accuracy, precision, recall, and F1 score they produce, against
+the standard industry practice on CNN models. We show that CKANs perform
+fairly, though more slowly, compared to CNNs on small datasets like MoA and
+MNIST, but are not nearly comparable as the dataset gets larger and more
+complex, as with ImageNet. The code implementation of this paper can be found
+on the link:
+https://github.com/ashimdahal/Study-of-Convolutional-Kolmogorov-Arnold-networks
+
+
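+ A minimal harness for the kind of efficiency comparison reported above
+(trainable parameters and inference latency) might look as follows; the toy
+CNN and input size are placeholders, not the paper's models or datasets.
+
+# Generic efficiency benchmark sketch: parameter count and mean inference time.
+import time
+import torch
+import torch.nn as nn
+
+def count_trainable_params(model: nn.Module) -> int:
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+@torch.no_grad()
+def mean_inference_ms(model: nn.Module, x: torch.Tensor,
+                      warmup: int = 5, iters: int = 20) -> float:
+    model.eval()
+    for _ in range(warmup):          # warm-up passes excluded from timing
+        model(x)
+    start = time.perf_counter()
+    for _ in range(iters):
+        model(x)
+    return (time.perf_counter() - start) / iters * 1000.0
+
+if __name__ == "__main__":
+    cnn = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                        nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 10))
+    x = torch.randn(8, 3, 32, 32)
+    print(f"params: {count_trainable_params(cnn):,}")
+    print(f"latency: {mean_inference_ms(cnn, x):.2f} ms/batch")
+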
+
+
+
+
+ + ☆ Z-Stack Scanning can Improve AI Detection of Mitosis: A Case Study of + Meningiomas + + +
+ Z-stack scanning is an emerging whole slide imaging technology that captures
+multiple focal planes alongside the z-axis of a glass slide. Because z-stacking
+offers enhanced depth information compared to single-layer whole slide imaging,
+this technology can be particularly useful for analyzing small-scale
+histopathological patterns. However, its actual clinical impact remains debated
+with mixed results. To clarify this, we investigate the effect of z-stack
+scanning on artificial intelligence (AI) mitosis detection of meningiomas. With
+the same set of 22 Hematoxylin and Eosin meningioma glass slides scanned by
+three different digital pathology scanners, we tested the performance of three
+AI pipelines on both single-layer and z-stacked whole slide images (WSIs).
+Results showed that in all scanner-AI combinations, z-stacked WSIs
+significantly increased the AI's sensitivity (+17.14%) for mitosis detection
+with only a marginal impact on precision. Our findings provide quantitative
+evidence that highlights z-stack scanning as a promising technique for AI
+mitosis detection, paving the way for more reliable AI-assisted pathology
+workflows, which can ultimately benefit patient management.
+
+
+
+ comment: To appear 2025 IEEE 22nd International Symposium on Biomedical + Imaging (ISBI) +
+
+
+
+
+ + ☆ Leveraging Video Vision Transformer for Alzheimer's Disease Diagnosis + from 3D Brain MRI + + +
+ Alzheimer's disease (AD) is a neurodegenerative disorder affecting millions
+worldwide, necessitating early and accurate diagnosis for optimal patient
+management. In recent years, advancements in deep learning have shown
+remarkable potential in medical image analysis.
+ Methods: In this study, we present "ViTranZheimer," an AD diagnosis approach
+which leverages video vision transformers to analyze 3D brain MRI data. By
+treating the 3D MRI volumes as videos, we exploit the temporal dependencies
+between slices to capture intricate structural relationships. The video vision
+transformer's self-attention mechanisms enable the model to learn long-range
+dependencies and identify subtle patterns that may indicate AD progression. Our
+proposed deep learning framework seeks to enhance the accuracy and sensitivity
+of AD diagnosis, empowering clinicians with a tool for early detection and
+intervention. We validate the performance of the video vision transformer using
+the ADNI dataset and conduct comparative analyses with other relevant models.
+ Results: The proposed ViTranZheimer model is compared with two hybrid models,
+CNN-BiLSTM and ViT-BiLSTM. CNN-BiLSTM is the combination of a convolutional
+neural network (CNN) and a bidirectional long short-term memory network
+(BiLSTM), while ViT-BiLSTM is the combination of a vision transformer (ViT)
+with BiLSTM. The accuracy levels achieved by the ViTranZheimer, CNN-BiLSTM, and
+ViT-BiLSTM models are 98.6%, 96.479%, and 97.465%, respectively. ViTranZheimer
+achieved the highest accuracy at 98.6%, outperforming the other models on this
+metric.
+ Conclusion: This research advances the understanding of applying deep learning
+techniques in neuroimaging and Alzheimer's disease research, paving the way for
+earlier and less invasive clinical diagnosis.
+
+
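+ A schematic of the "volume as video" idea: embed each axial slice and let a
+Transformer attend across the slice dimension. This is a generic stand-in, not
+the ViTranZheimer architecture; the slice encoder, dimensions, and pooling are
+assumptions.
+
+# Illustrative slice-sequence classifier (not the paper's model).
+import torch
+import torch.nn as nn
+
+class SliceSequenceClassifier(nn.Module):
+    def __init__(self, num_slices=32, dim=128, num_classes=2):
+        super().__init__()
+        self.slice_embed = nn.Sequential(                 # per-slice CNN embedding
+            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
+            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
+            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, dim))
+        self.pos = nn.Parameter(torch.zeros(1, num_slices, dim))
+        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
+        self.temporal = nn.TransformerEncoder(layer, num_layers=2)  # attention across slices
+        self.head = nn.Linear(dim, num_classes)
+
+    def forward(self, volume):                            # volume: (B, D, H, W)
+        b, d, h, w = volume.shape
+        tokens = self.slice_embed(volume.reshape(b * d, 1, h, w)).reshape(b, d, -1)
+        tokens = self.temporal(tokens + self.pos)
+        return self.head(tokens.mean(dim=1))              # pooled slice tokens -> class logits
+
+if __name__ == "__main__":
+    mri = torch.randn(2, 32, 64, 64)                      # toy volume: 32 slices of 64x64
+    print(SliceSequenceClassifier()(mri).shape)           # torch.Size([2, 2])
+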
+
+
+
+
+ + ☆ A Survey on Computational Pathology Foundation Models: Datasets, + Adaptation Strategies, and Evaluation Tasks + + +
+ Computational pathology foundation models (CPathFMs) have emerged as a +powerful approach for analyzing histopathological data, leveraging +self-supervised learning to extract robust feature representations from +unlabeled whole-slide images. These models, categorized into uni-modal and +multi-modal frameworks, have demonstrated promise in automating complex +pathology tasks such as segmentation, classification, and biomarker discovery. +However, the development of CPathFMs presents significant challenges, such as +limited data accessibility, high variability across datasets, the necessity for +domain-specific adaptation, and the lack of standardized evaluation benchmarks. +This survey provides a comprehensive review of CPathFMs in computational +pathology, focusing on datasets, adaptation strategies, and evaluation tasks. +We analyze key techniques, such as contrastive learning and multi-modal +integration, and highlight existing gaps in current research. Finally, we +explore future directions from four perspectives for advancing CPathFMs. This +survey serves as a valuable resource for researchers, clinicians, and AI +practitioners, guiding the advancement of CPathFMs toward robust and clinically +applicable AI-driven pathology solutions. + +
+
+
+
+
+ + ☆ SeqSeg: Learning Local Segments for Automatic Vascular Model + Construction + + +
+ Computational modeling of cardiovascular function has become a critical part +of diagnosing, treating and understanding cardiovascular disease. Most +strategies involve constructing anatomically accurate computer models of +cardiovascular structures, which is a multistep, time-consuming process. To +improve the model generation process, we herein present SeqSeg (sequential +segmentation): a novel deep learning based automatic tracing and segmentation +algorithm for constructing image-based vascular models. SeqSeg leverages local +U-Net-based inference to sequentially segment vascular structures from medical +image volumes. We tested SeqSeg on CT and MR images of aortic and aortofemoral +models and compared the predictions to those of benchmark 2D and 3D global +nnU-Net models, which have previously shown excellent accuracy for medical +image segmentation. We demonstrate that SeqSeg is able to segment more complete +vasculature and is able to generalize to vascular structures not annotated in +the training data. + +
+
+ comment: 32 pages, 12 figures. Ann Biomed Eng (2024) +
+
+
+
+
+ + ☆ Directing Mamba to Complex Textures: An Efficient Texture-Aware State + Space Model for Image Restoration + + +
+ Image restoration aims to recover details and enhance contrast in degraded
+images. With the growing demand for high-quality imaging (e.g., 4K and 8K),
+achieving a balance between restoration quality and computational efficiency
+has become increasingly critical. Existing methods, primarily based on CNNs,
+Transformers, or their hybrid approaches, apply uniform deep representation
+extraction across the image. However, these methods often struggle to
+effectively model long-range dependencies and largely overlook the spatial
+characteristics of image degradation (regions with richer textures tend to
+suffer more severe damage), making it hard to achieve the best trade-off
+between restoration quality and efficiency. To address these issues, we propose
+a novel texture-aware image restoration method, TAMambaIR, which simultaneously
+perceives image textures and achieves a trade-off between performance and
+efficiency. Specifically, we introduce a novel Texture-Aware State Space Model,
+which enhances texture awareness and improves efficiency by modulating the
+transition matrix of the state-space equation and focusing on regions with
+complex textures. Additionally, we design a Multi-Directional Perception Block
+to improve multi-directional receptive fields while maintaining low
+computational overhead. Extensive experiments on benchmarks for image
+super-resolution, deraining, and low-light image enhancement demonstrate that
+TAMambaIR achieves state-of-the-art performance with significantly improved
+efficiency, establishing it as a robust and efficient framework for image
+restoration.
+
+
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Efficient Object Detection of Marine Debris using Pruned YOLO Model + + +
+ Marine debris poses significant harm to marine life due to substances like
+microplastics, polychlorinated biphenyls, and pesticides, which damage habitats
+and poison organisms. Human-based solutions, such as diving, are increasingly
+ineffective in addressing this issue. Autonomous underwater vehicles (AUVs) are
+being developed for efficient sea garbage collection, with the choice of object
+detection architecture being critical. This research employs the YOLOv4 model
+for real-time detection of marine debris using the Trash-ICRA 19 dataset,
+consisting of 7683 images at 480x320 pixels. Various modifications (pretrained
+models, training from scratch, mosaic augmentation, layer freezing,
+YOLOv4-tiny, and channel pruning) are compared to enhance architecture
+efficiency. Channel pruning significantly improves detection speed, increasing
+the base YOLOv4 frame rate from 15.19 FPS to 19.4 FPS, with only a 1.2% drop in
+mean Average Precision, from 97.6% to 96.4%.
+
+
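+ Channel pruning of a convolution + BatchNorm pair can be sketched generically
+by ranking output channels by the magnitude of the BatchNorm scale factors and
+keeping the top fraction. This illustrates the technique compared above, not
+the exact pruning pipeline applied to YOLOv4 in the paper; the keep ratio and
+ranking criterion are assumptions.
+
+# Generic BN-scale channel-pruning sketch.
+import torch
+import torch.nn as nn
+
+def prune_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d, keep_ratio: float = 0.75):
+    n_keep = max(1, int(conv.out_channels * keep_ratio))
+    keep = torch.argsort(bn.weight.detach().abs(), descending=True)[:n_keep]
+    new_conv = nn.Conv2d(conv.in_channels, n_keep, conv.kernel_size,
+                         conv.stride, conv.padding, bias=conv.bias is not None)
+    new_bn = nn.BatchNorm2d(n_keep)
+    with torch.no_grad():                      # copy surviving channels only
+        new_conv.weight.copy_(conv.weight[keep])
+        if conv.bias is not None:
+            new_conv.bias.copy_(conv.bias[keep])
+        new_bn.weight.copy_(bn.weight[keep])
+        new_bn.bias.copy_(bn.bias[keep])
+        new_bn.running_mean.copy_(bn.running_mean[keep])
+        new_bn.running_var.copy_(bn.running_var[keep])
+    return new_conv, new_bn, keep              # `keep` indexes the surviving channels
+
+if __name__ == "__main__":
+    conv, bn = nn.Conv2d(3, 32, 3, padding=1), nn.BatchNorm2d(32)
+    p_conv, p_bn, kept = prune_conv_bn(conv, bn, keep_ratio=0.5)
+    x = torch.randn(1, 3, 64, 64)
+    print(p_bn(p_conv(x)).shape)               # torch.Size([1, 16, 64, 64])
+
+ In a full network, the next layer's input channels must also be sliced with
+the same keep indices before fine-tuning the pruned model.
+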
+
+
+
+
+ + ☆ LoRA-X: Bridging Foundation Models with Training-Free Cross-Model + Adaptation ICLR 2025 + + +
+ The rising popularity of large foundation models has led to a heightened
+demand for parameter-efficient fine-tuning methods, such as Low-Rank Adaptation
+(LoRA), which offer performance comparable to full model fine-tuning while
+requiring only a few additional parameters tailored to the specific base model.
+When such base models are deprecated and replaced, all associated LoRA modules
+must be retrained, requiring access to either the original training data or a
+substantial amount of synthetic data that mirrors the original distribution.
+However, the original data is often inaccessible due to privacy or licensing
+issues, and generating synthetic data may be impractical and insufficiently
+representative. These factors complicate the fine-tuning process considerably.
+To address this challenge, we introduce a new adapter, Cross-Model Low-Rank
+Adaptation (LoRA-X), which enables the training-free transfer of LoRA
+parameters across source and target models, eliminating the need for original
+or synthetic training data. Our approach constrains the adapter to operate
+within the subspace of the source base model. This constraint is necessary
+because our prior knowledge of the target model is limited to its weights, and
+the criteria for ensuring the adapter's transferability are restricted to the
+target base model's weights and subspace. To facilitate the transfer of LoRA
+parameters of the source model to a target model, we employ the adapter only in
+the layers of the target model that exhibit an acceptable level of subspace
+similarity. Our extensive experiments demonstrate the effectiveness of LoRA-X
+for text-to-image generation, including Stable Diffusion v1.5 and Stable
+Diffusion XL.
+
+
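+ The subspace constraint could be sketched as follows: measure how well the
+leading singular subspaces of the source and target base weights agree, and,
+when they do, project the source LoRA update onto the target subspace. The
+projection rule, rank, and threshold here are illustrative assumptions, not the
+exact LoRA-X formulation.
+
+# Hedged sketch of training-free LoRA transfer via weight subspaces.
+import torch
+
+def top_subspace(weight: torch.Tensor, k: int) -> torch.Tensor:
+    u, _, _ = torch.linalg.svd(weight, full_matrices=False)
+    return u[:, :k]                                     # (out_dim, k) left singular vectors
+
+def subspace_similarity(w_src, w_tgt, k: int = 32) -> float:
+    u_s, u_t = top_subspace(w_src, k), top_subspace(w_tgt, k)
+    # Mean squared cosine of the principal angles between the two subspaces, in [0, 1].
+    return (torch.linalg.svdvals(u_s.T @ u_t) ** 2).mean().item()
+
+def transfer_lora(delta_w_src, w_src, w_tgt, k: int = 32, threshold: float = 0.5):
+    if subspace_similarity(w_src, w_tgt, k) < threshold:  # skip dissimilar layers
+        return None
+    u_t = top_subspace(w_tgt, k)
+    return u_t @ (u_t.T @ delta_w_src)                    # project update onto target subspace
+
+if __name__ == "__main__":
+    w_src = torch.randn(256, 256)
+    w_tgt = w_src + 0.05 * torch.randn(256, 256)          # a closely related target model
+    lora_a, lora_b = torch.randn(256, 8), torch.randn(8, 256)
+    moved = transfer_lora(lora_a @ lora_b, w_src, w_tgt)
+    print(None if moved is None else moved.shape)
+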
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ☆ PackDiT: Joint Human Motion and Text Generation via Mutual Prompting + + +
+ Human motion generation has advanced markedly with the advent of diffusion +models. Most recent studies have concentrated on generating motion sequences +based on text prompts, commonly referred to as text-to-motion generation. +However, the bidirectional generation of motion and text, enabling tasks such +as motion-to-text alongside text-to-motion, has been largely unexplored. This +capability is essential for aligning diverse modalities and supports +unconditional generation. In this paper, we introduce PackDiT, the first +diffusion-based generative model capable of performing various tasks +simultaneously, including motion generation, motion prediction, text +generation, text-to-motion, motion-to-text, and joint motion-text generation. +Our core innovation leverages mutual blocks to integrate multiple diffusion +transformers (DiTs) across different modalities seamlessly. We train PackDiT on +the HumanML3D dataset, achieving state-of-the-art text-to-motion performance +with an FID score of 0.106, along with superior results in motion prediction +and in-between tasks. Our experiments further demonstrate that diffusion models +are effective for motion-to-text generation, achieving performance comparable +to that of autoregressive models. + +
+
+
+
+
+ + ☆ PhysAnimator: Physics-Guided Generative Cartoon Animation + + +
+ Creating hand-drawn animation sequences is labor-intensive and demands
+professional expertise. We introduce PhysAnimator, a novel approach for
+generating physically plausible yet anime-stylized animation from static anime
+illustrations. Our method seamlessly integrates physics-based simulations with
+data-driven generative models to produce dynamic and visually compelling
+animations. To capture the fluidity and exaggeration characteristic of anime,
+we perform image-space deformable body simulations on extracted mesh
+geometries. We enhance artistic control by introducing customizable energy
+strokes and incorporating rigging point support, enabling the creation of
+tailored animation effects such as wind interactions. Finally, we extract and
+warp sketches from the simulation sequence, generating a texture-agnostic
+representation, and employ a sketch-guided video diffusion model to synthesize
+high-quality animation frames. The resulting animations exhibit temporal
+consistency and visual plausibility, demonstrating the effectiveness of our
+method in creating dynamic anime-style animations.
+
+
+
+
+
+
+ + ☆ Multi-Objective Deep-Learning-based Biomechanical Deformable Image + Registration with MOREA + + +
+ When choosing a deformable image registration (DIR) approach for images with +large deformations and content mismatch, the realism of found transformations +often needs to be traded off against the required runtime. DIR approaches using +deep learning (DL) techniques have shown remarkable promise in instantly +predicting a transformation. However, on difficult registration problems, the +realism of these transformations can fall short. DIR approaches using +biomechanical, finite element modeling (FEM) techniques can find more realistic +transformations, but tend to require much longer runtimes. This work proposes +the first hybrid approach to combine them, with the aim of getting the best of +both worlds. This hybrid approach, called DL-MOREA, combines a recently +introduced multi-objective DL-based DIR approach which leverages the VoxelMorph +framework, called DL-MODIR, with MOREA, an evolutionary algorithm-based, +multi-objective DIR approach in which a FEM-like biomechanical mesh +transformation model is used. In our proposed hybrid approach, the DL results +are used to smartly initialize MOREA, with the aim of more efficiently +optimizing its mesh transformation model. We empirically compare DL-MOREA +against its components, DL-MODIR and MOREA, on CT scan pairs capturing large +bladder filling differences of 15 cervical cancer patients. While MOREA +requires a median runtime of 45 minutes, DL-MOREA can already find high-quality +transformations after 5 minutes. Compared to the DL-MODIR transformations, the +transformations found by DL-MOREA exhibit far less folding and improve or +preserve the bladder contour distance error. + +
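+ The hybrid initialization idea generalizes beyond deformable registration:
+seed an evolutionary search with a solution predicted by a learned model
+instead of starting from random candidates. The toy objective, population
+scheme, and noise scale below are assumptions for illustration, not MOREA or
+DL-MODIR.
+
+# Generic sketch: DL-seeded vs. random initialization of an evolutionary search.
+import numpy as np
+
+def objective(x: np.ndarray) -> float:
+    return float(np.sum((x - 3.0) ** 2))          # stand-in for a registration energy
+
+def evolve(population: np.ndarray, generations: int = 50, sigma: float = 0.1):
+    rng = np.random.default_rng(0)
+    for _ in range(generations):
+        children = population + rng.normal(0.0, sigma, population.shape)
+        merged = np.vstack([population, children])
+        scores = np.array([objective(ind) for ind in merged])
+        population = merged[np.argsort(scores)[: len(population)]]   # keep the best half
+    return population
+
+if __name__ == "__main__":
+    dim, pop_size = 16, 20
+    dl_prediction = np.full(dim, 2.7)             # pretend this came from a DL registration model
+    rng = np.random.default_rng(1)
+    seeded = dl_prediction + rng.normal(0.0, 0.2, (pop_size, dim))   # smart initialization
+    random_init = rng.uniform(-5.0, 5.0, (pop_size, dim))
+    print("seeded best:", objective(evolve(seeded)[0]))
+    print("random best:", objective(evolve(random_init)[0]))
+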
+
+ comment: Pre-print for the SPIE Medical Imaging: Image Processing Conference +
+
+
+
+
+ + ☆ Generating customized prompts for Zero-Shot Rare Event Medical Image + Classification using LLM + + +
+ Rare events occur infrequently and therefore provide little data, so deep
+learning techniques fail to estimate the distribution of such data.
+Open-vocabulary models represent an innovative approach to image
+classification. Unlike traditional models, these models classify images into
+any set of categories specified with natural language prompts during inference.
+These prompts usually comprise manually crafted templates (e.g., 'a photo of a
+{}') that are filled in with the names of each category. This paper introduces
+a simple yet effective method for generating highly accurate and contextually
+descriptive prompts containing discriminative characteristics. Rare event
+detection, especially in medicine, is more challenging due to low inter-class
+and high intra-class variability. To address these challenges, we propose a
+novel approach that uses domain-specific expert knowledge on rare events to
+generate customized and contextually relevant prompts, which are then used by
+large language models for image classification. Our zero-shot,
+privacy-preserving method enhances rare event classification without additional
+training, outperforming state-of-the-art techniques.
+
+
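+ A sketch of zero-shot classification with expert-written, descriptive prompts
+in place of the generic 'a photo of a {}' template. CLIP is used here only as a
+convenient open-vocabulary stand-in; the prompts, classes, and backbone are
+assumptions rather than the paper's setup (the checkpoint is downloaded on
+first run).
+
+# Zero-shot classification with customized prompts (illustrative).
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+# Hypothetical expert-informed prompts for a rare-event example.
+prompts = {
+    "normal tissue": "a histopathology patch with uniform, well-organized cells",
+    "rare lesion": "a histopathology patch showing sparse, irregular cells with enlarged nuclei",
+}
+
+image = Image.new("RGB", (224, 224), color="white")        # placeholder image
+inputs = processor(text=list(prompts.values()), images=image,
+                   return_tensors="pt", padding=True)
+with torch.no_grad():
+    logits = model(**inputs).logits_per_image               # (1, num_prompts)
+probs = logits.softmax(dim=-1).squeeze(0)
+for (label, _), p in zip(prompts.items(), probs):
+    print(f"{label}: {p.item():.3f}")
+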
+
+ comment: Accepted in IEEE ISBI, 2025 +
+
+
+
+
+ + ☆ Object Detection for Medical Image Analysis: Insights from the RT-DETR + Model + + +
+ Deep learning has emerged as a transformative approach for solving complex +pattern recognition and object detection challenges. This paper focuses on the +application of a novel detection framework based on the RT-DETR model for +analyzing intricate image data, particularly in areas such as diabetic +retinopathy detection. Diabetic retinopathy, a leading cause of vision loss +globally, requires accurate and efficient image analysis to identify +early-stage lesions. The proposed RT-DETR model, built on a Transformer-based +architecture, excels at processing high-dimensional and complex visual data +with enhanced robustness and accuracy. Comparative evaluations with models such +as YOLOv5, YOLOv8, SSD, and DETR demonstrate that RT-DETR achieves superior +performance across precision, recall, mAP50, and mAP50-95 metrics, particularly +in detecting small-scale objects and densely packed targets. This study +underscores the potential of Transformer-based models like RT-DETR for +advancing object detection tasks, offering promising applications in medical +imaging and beyond. + +
+
+
+
+
+ + ☆ Cross-Domain Semantic Segmentation with Large Language Model-Assisted + Descriptor Generation + + +
+ Semantic segmentation plays a crucial role in enabling machines to understand +and interpret visual scenes at a pixel level. While traditional segmentation +methods have achieved remarkable success, their generalization to diverse +scenes and unseen object categories remains limited. Recent advancements in +large language models (LLMs) offer a promising avenue for bridging visual and +textual modalities, providing a deeper understanding of semantic relationships. +In this paper, we propose LangSeg, a novel LLM-guided semantic segmentation +method that leverages context-sensitive, fine-grained subclass descriptors +generated by LLMs. Our framework integrates these descriptors with a +pre-trained Vision Transformer (ViT) to achieve superior segmentation +performance without extensive model retraining. We evaluate LangSeg on two +challenging datasets, ADE20K and COCO-Stuff, where it outperforms +state-of-the-art models, achieving up to a 6.1% improvement in mean +Intersection over Union (mIoU). Additionally, we conduct a comprehensive +ablation study and human evaluation to validate the effectiveness of our method +in real-world scenarios. The results demonstrate that LangSeg not only excels +in semantic understanding and contextual alignment but also provides a flexible +and efficient framework for language-guided segmentation tasks. This approach +opens up new possibilities for interactive and domain-specific segmentation +applications. + +
+
+
+
+
+ + ☆ BiFold: Bimanual Cloth Folding with Language Guidance ICRA 2025 + + +
+ Cloth folding is a complex task due to the inevitable self-occlusions of +clothes, their complicated dynamics, and the disparate materials, geometries, +and textures that garments can have. In this work, we learn folding actions +conditioned on text commands. Translating high-level, abstract instructions +into precise robotic actions requires sophisticated language understanding and +manipulation capabilities. To do that, we leverage a pre-trained +vision-language model and repurpose it to predict manipulation actions. Our +model, BiFold, can take context into account and achieves state-of-the-art +performance on an existing language-conditioned folding benchmark. Given the +lack of annotated bimanual folding data, we devise a procedure to automatically +parse actions of a simulated dataset and tag them with aligned text +instructions. BiFold attains the best performance on our dataset and can +transfer to new instructions, garments, and environments. + +
+
+ comment: Accepted at ICRA 2025 +
+
+
+
+
+ + ☆ Objects matter: object-centric world models improve reinforcement + learning in visually complex environments + + +
+ Deep reinforcement learning has achieved remarkable success in learning +control policies from pixels across a wide range of tasks, yet its application +remains hindered by low sample efficiency, requiring significantly more +environment interactions than humans to reach comparable performance. +Model-based reinforcement learning (MBRL) offers a solution by leveraging +learnt world models to generate simulated experience, thereby improving sample +efficiency. However, in visually complex environments, small or dynamic +elements can be critical for decision-making. Yet, traditional MBRL methods in +pixel-based environments typically rely on auto-encoding with an $L_2$ loss, +which is dominated by large areas and often fails to capture decision-relevant +details. To address these limitations, we propose an object-centric MBRL +pipeline, which integrates recent advances in computer vision to allow agents +to focus on key decision-related elements. Our approach consists of four main +steps: (1) annotating key objects related to rewards and goals with +segmentation masks, (2) extracting object features using a pre-trained, frozen +foundation vision model, (3) incorporating these object features with the raw +observations to predict environmental dynamics, and (4) training the policy +using imagined trajectories generated by this object-centric world model. +Building on the efficient MBRL algorithm STORM, we call this pipeline OC-STORM. +We demonstrate OC-STORM's practical value in overcoming the limitations of +conventional MBRL approaches on both Atari games and the visually complex game +Hollow Knight. + +
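+ Steps (2)-(3) of the pipeline above can be sketched as masking out each
+annotated object, encoding the masked crops with a frozen pretrained encoder,
+and appending the object features to the world-model input. ResNet-18 stands in
+for the foundation vision model, and the mask handling and pooling are
+assumptions (the pretrained weights are downloaded on first use).
+
+# Illustrative object-feature extraction with a frozen pretrained encoder.
+import torch
+import torch.nn as nn
+from torchvision.models import resnet18, ResNet18_Weights
+
+encoder = resnet18(weights=ResNet18_Weights.DEFAULT)
+encoder.fc = nn.Identity()                       # 512-d features from the penultimate layer
+encoder.eval()
+for p in encoder.parameters():
+    p.requires_grad_(False)                      # frozen, as in the pipeline described above
+
+@torch.no_grad()
+def object_features(frame: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
+    """frame: (3, H, W) in [0, 1]; masks: (K, H, W) binary. Returns (K, 512)."""
+    crops = frame.unsqueeze(0) * masks.unsqueeze(1)          # zero out everything but the object
+    crops = nn.functional.interpolate(crops, size=(224, 224), mode="bilinear",
+                                      align_corners=False)
+    return encoder(crops)
+
+if __name__ == "__main__":
+    frame = torch.rand(3, 128, 128)
+    masks = (torch.rand(4, 128, 128) > 0.7).float()          # 4 toy object masks
+    obj_feats = object_features(frame, masks)
+    frame_feat = obj_feats.mean(dim=0)                       # naive pooling for the world-model input
+    print(obj_feats.shape, frame_feat.shape)                 # (4, 512) (512,)
+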
+
+
+
+
+ + ☆ PhysBench: Benchmarking and Enhancing Vision-Language Models for + Physical World Understanding ICLR 2025 + + +
+ Understanding the physical world is a fundamental challenge in embodied AI,
+critical for enabling agents to perform complex tasks and operate safely in
+real-world environments. While Vision-Language Models (VLMs) have shown great
+promise in reasoning and task planning for embodied agents, their ability to
+comprehend physical phenomena remains extremely limited. To close this gap, we
+introduce PhysBench, a comprehensive benchmark designed to evaluate VLMs'
+physical world understanding capability across a diverse set of tasks.
+PhysBench contains 100,000 entries of interleaved video-image-text data,
+categorized into four major domains: physical object properties, physical
+object relationships, physical scene understanding, and physics-based dynamics,
+further divided into 19 subclasses and 8 distinct capability dimensions. Our
+extensive experiments, conducted on 75 representative VLMs, reveal that while
+these models excel in common-sense reasoning, they struggle with understanding
+the physical world -- likely due to the absence of physical knowledge in their
+training data and the lack of embedded physical priors. To tackle this
+shortfall, we introduce PhysAgent, a novel framework that combines the
+generalization strengths of VLMs with the specialized expertise of vision
+models, significantly enhancing VLMs' physical understanding across a variety
+of tasks, including an 18.4% improvement on GPT-4o. Furthermore, our results
+demonstrate that enhancing VLMs' physical world understanding capabilities can
+help embodied agents such as MOKA. We believe that PhysBench and PhysAgent
+offer valuable insights and contribute to bridging the gap between VLMs and
+physical world understanding.
+
+
+
+ comment: ICLR 2025. Project page: https://physbench.github.io/; Dataset: + https://huggingface.co/datasets/USC-GVL/PhysBench; +
+
+
+
+
+ + ☆ DynAlign: Unsupervised Dynamic Taxonomy Alignment for Cross-Domain + Segmentation + + +
+ Current unsupervised domain adaptation (UDA) methods for semantic +segmentation typically assume identical class labels between the source and +target domains. This assumption ignores the label-level domain gap, which is +common in real-world scenarios, thus limiting their ability to identify +finer-grained or novel categories without requiring extensive manual +annotation. A promising direction to address this limitation lies in recent +advancements in foundation models, which exhibit strong generalization +abilities due to their rich prior knowledge. However, these models often +struggle with domain-specific nuances and underrepresented fine-grained +categories. + To address these challenges, we introduce DynAlign, a framework that +integrates UDA with foundation models to bridge both the image-level and +label-level domain gaps. Our approach leverages prior semantic knowledge to +align source categories with target categories that can be novel, more +fine-grained, or named differently (e.g., vehicle to {car, truck, bus}). +Foundation models are then employed for precise segmentation and category +reassignment. To further enhance accuracy, we propose a knowledge fusion +approach that dynamically adapts to varying scene contexts. DynAlign generates +accurate predictions in a new target label space without requiring any manual +annotations, allowing seamless adaptation to new taxonomies through either +model retraining or direct inference. + Experiments on the street scene semantic segmentation benchmarks GTA to +Mapillary Vistas and GTA to IDD validate the effectiveness of our approach, +achieving a significant improvement over existing methods. Our code will be +publicly available. + +
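+ The label-level alignment can be pictured as a mapping from each coarse source
+class to candidate fine-grained target classes, with a foundation model's
+per-pixel scores resolving the choice among the candidates only. The mapping,
+classes, and "fine model" scores below are placeholders for illustration, not
+DynAlign itself.
+
+# Toy coarse-to-fine label refinement sketch.
+import numpy as np
+
+SOURCE_TO_TARGET = {
+    "vehicle": ["car", "truck", "bus"],
+    "person": ["pedestrian", "rider"],
+}
+TARGET_CLASSES = ["car", "truck", "bus", "pedestrian", "rider"]
+
+def refine_labels(coarse_pred, fine_scores, source_classes):
+    """coarse_pred: (H, W) indices into source_classes.
+    fine_scores: (len(TARGET_CLASSES), H, W) per-pixel scores from a foundation model."""
+    refined = np.empty(coarse_pred.shape, dtype=object)
+    for s_idx, s_name in enumerate(source_classes):
+        candidates = SOURCE_TO_TARGET[s_name]
+        cand_idx = [TARGET_CLASSES.index(c) for c in candidates]
+        best = np.argmax(fine_scores[cand_idx], axis=0)       # argmax over candidates only
+        mask = coarse_pred == s_idx
+        refined[mask] = np.array(candidates, dtype=object)[best][mask]
+    return refined
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    coarse = rng.integers(0, 2, size=(4, 4))                  # 0 = vehicle, 1 = person
+    scores = rng.random((len(TARGET_CLASSES), 4, 4))          # pretend foundation-model scores
+    print(refine_labels(coarse, scores, ["vehicle", "person"]))
+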
+
+
+
+
+ + ♻ ☆ GUI-Bee: Align GUI Action Grounding to Novel Environments via Autonomous + Exploration + + +
+ Graphical User Interface (GUI) action grounding is a critical step in GUI +automation that maps language instructions to actionable elements on GUI +screens. Most recent works of GUI action grounding leverage large GUI datasets +to fine-tune MLLMs. However, the fine-tuning data always covers limited GUI +environments, and we find the performance of the resulting model deteriorates +in novel environments. We argue that the GUI grounding models should be further +aligned to the novel environments to reveal their full potential, when the +inference is known to involve novel environments, i.e., environments not used +during the previous fine-tuning. To realize this, we first propose GUI-Bee, an +MLLM-based autonomous agent, to collect high-quality, environment-specific data +through exploration and then continuously fine-tune GUI grounding models with +the collected data. Our agent leverages a novel Q-value-Incentive In-Context +Reinforcement Learning (Q-ICRL) method to optimize exploration efficiency and +data quality. Additionally, we introduce NovelScreenSpot, a benchmark for +testing how well the data can help align GUI action grounding models to novel +environments and demonstrate the effectiveness of data collected by GUI-Bee in +the experiments. Furthermore, we conduct an ablation study to validate the +Q-ICRL method in enhancing the efficiency of GUI-Bee. Project page: +https://gui-bee.github.io + +
+
+
+
+
+ + ♻ ☆ MedPromptX: Grounded Multimodal Prompting for Chest X-ray Diagnosis + + +
+ Chest X-ray images are commonly used for predicting acute and chronic +cardiopulmonary conditions, but efforts to integrate them with structured +clinical data face challenges due to incomplete electronic health records +(EHR). This paper introduces MedPromptX, the first clinical decision support +system that integrates multimodal large language models (MLLMs), few-shot +prompting (FP) and visual grounding (VG) to combine imagery with EHR data for +chest X-ray diagnosis. A pre-trained MLLM is utilized to complement the missing +EHR information, providing a comprehensive understanding of patients' medical +history. Additionally, FP reduces the necessity for extensive training of MLLMs +while effectively tackling the issue of hallucination. Nevertheless, the +process of determining the optimal number of few-shot examples and selecting +high-quality candidates can be burdensome, yet it profoundly influences model +performance. Hence, we propose a new technique that dynamically refines +few-shot data for real-time adjustment to new patient scenarios. Moreover, VG +narrows the search area in X-ray images, thereby enhancing the identification +of abnormalities. We also release MedPromptX-VQA, a new in-context visual +question answering dataset encompassing interleaved images and EHR data derived +from MIMIC-IV and MIMIC-CXR-JPG databases. Results demonstrate the SOTA +performance of MedPromptX, achieving an 11% improvement in F1-score compared to +the baselines. Code and data are publicly available on +https://github.com/BioMedIA-MBZUAI/MedPromptX. + +
+
+
+
+
+ + ♻ ☆ PEP-GS: Perceptually-Enhanced Precise Structured 3D Gaussians for + View-Adaptive Rendering + + +
+ Recently, 3D Gaussian Splatting (3D-GS) has achieved significant success in +real-time, high-quality 3D scene rendering. However, it faces several +challenges, including Gaussian redundancy, limited ability to capture +view-dependent effects, and difficulties in handling complex lighting and +specular reflections. Additionally, methods that use spherical harmonics for +color representation often struggle to effectively capture specular highlights +and anisotropic components, especially when modeling view-dependent colors +under complex lighting conditions, leading to insufficient contrast and +unnatural color saturation. To address these limitations, we introduce PEP-GS, +a perceptually-enhanced framework that dynamically predicts Gaussian +attributes, including opacity, color, and covariance. We replace traditional +spherical harmonics with a Hierarchical Granular-Structural Attention +mechanism, which enables more accurate modeling of complex view-dependent color +effects and specular highlights. By employing a stable and interpretable +framework for opacity and covariance estimation, PEP-GS avoids the removal of +essential Gaussians prematurely, ensuring a more accurate scene representation. +Furthermore, perceptual optimization is applied to the final rendered images, +enhancing perceptual consistency across different views and ensuring +high-quality renderings with improved texture fidelity and fine-scale detail +preservation. Experimental results demonstrate that PEP-GS outperforms +state-of-the-art methods, particularly in challenging scenarios involving +view-dependent effects, specular reflections, and fine-scale details. + +
+
+
+
+
+ + ♻ ☆ 2.5 Years in Class: A Multimodal Textbook for Vision-Language + Pretraining + + +
+ Compared to image-text pair data, interleaved corpora enable Vision-Language
+Models (VLMs) to understand the world more naturally like humans. However, such
+existing datasets are crawled from webpages, facing challenges like low
+knowledge density, loose image-text relations, and poor logical coherence
+between images. On the other hand, the internet hosts vast instructional videos
+(e.g., online geometry courses) that are widely used by humans to learn
+foundational subjects, yet these valuable resources remain underexplored in VLM
+training. In this paper, we introduce a high-quality multimodal textbook corpus
+with richer foundational knowledge for VLM pretraining. It collects over 2.5
+years of instructional videos, totaling 22,000 class hours. We first use an
+LLM-proposed taxonomy to systematically gather instructional videos. Then we
+progressively extract and refine visual (keyframes), audio (ASR), and textual
+knowledge (OCR) from the videos, and organize them as an image-text interleaved
+corpus based on temporal order. Compared to its counterparts, our video-centric
+textbook offers more coherent context, richer knowledge, and better image-text
+alignment. Experiments demonstrate its superb pretraining performance,
+particularly in knowledge- and reasoning-intensive tasks like ScienceQA and
+MathVista. Moreover, VLMs pre-trained on our textbook exhibit outstanding
+interleaved context awareness, leveraging visual and textual cues in their
+few-shot context for task solving. Our code is available at
+https://github.com/DAMO-NLP-SG/multimodal_textbook.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ From Dashcam Videos to Driving Simulations: Stress Testing Automated + Vehicles against Rare Events + + +
+ Testing Automated Driving Systems (ADS) in simulation with realistic driving
+scenarios is important for verifying their performance. However, converting
+real-world driving videos into simulation scenarios is a significant challenge
+due to the complexity of interpreting high-dimensional video data and the
+time-consuming nature of precise manual scenario reconstruction. In this work,
+we propose a novel framework that automates the conversion of real-world car
+crash videos into detailed simulation scenarios for ADS testing. Our approach
+leverages prompt-engineered Video Language Models (VLMs) to transform dashcam
+footage into SCENIC scripts, which define the environment and driving behaviors
+in the CARLA simulator, enabling the generation of realistic simulation
+scenarios. Importantly, rather than solely aiming for one-to-one scenario
+reconstruction, our framework focuses on capturing the essential driving
+behaviors from the original video while offering flexibility in parameters such
+as weather or road conditions to facilitate search-based testing. Additionally,
+we introduce a similarity metric that helps iteratively refine the generated
+scenario through feedback by comparing key features of driving behaviors
+between the real and simulated videos. Our preliminary results demonstrate
+substantial time efficiency, finishing the real-to-sim conversion in minutes
+with full automation and no human intervention, while maintaining high fidelity
+to the original driving events.
+
+
+
+
+
+
+ + ♻ ☆ MoColl: Agent-Based Specific and General Model Collaboration for Image + Captioning + + +
+ Image captioning is a critical task at the intersection of computer vision +and natural language processing, with wide-ranging applications across various +domains. For complex tasks such as diagnostic report generation, deep learning +models require not only domain-specific image-caption datasets but also the +incorporation of relevant general knowledge to provide contextual accuracy. +Existing approaches exhibit inherent limitations: specialized models excel in +capturing domain-specific details but lack generalization, while +vision-language models (VLMs) built on large language models (LLMs) leverage +general knowledge but struggle with domain-specific adaptation. To address +these limitations, this paper proposes a novel agent-enhanced model +collaboration framework, which we call MoColl, designed to effectively +integrate domain-specific and general knowledge. Specifically, our approach is +to decompose complex image captioning tasks into a series of interconnected +question-answer subtasks. A trainable visual question answering (VQA) model is +employed as a specialized tool to focus on domain-specific visual analysis, +answering task-specific questions based on image content. Concurrently, an +LLM-based agent with general knowledge formulates these questions and +synthesizes the resulting question-answer pairs into coherent captions. Beyond +its role in leveraging the VQA model, the agent further guides its training to +enhance its domain-specific capabilities. Experimental results on radiology +report generation validate the effectiveness of the proposed framework, +demonstrating significant improvements in the quality of generated reports. + +
+
+
+
+
+ + ♻ ☆ Text-driven Adaptation of Foundation Models for Few-shot Surgical + Workflow Analysis + + +
+ Purpose: Surgical workflow analysis is crucial for improving surgical
+efficiency and safety. However, previous studies rely heavily on large-scale
+annotated datasets, posing challenges in cost, scalability, and reliance on
+expert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven
+Adaptation), designed to handle various surgical workflow analysis tasks with
+minimal paired image-label data.
+ Methods: Our approach has two key components. First, Few-shot selection-based
+modality alignment selects a small subset of images and aligns their embeddings
+with text embeddings from the downstream task, bridging the modality gap.
+Second, Text-driven adaptation leverages only text data to train a decoder,
+eliminating the need for paired image-text data. This decoder is then applied
+to aligned image embeddings, enabling image-related tasks without explicit
+image-text pairs.
+ Results: We evaluate our approach on generative tasks (image captioning) and
+discriminative tasks (triplet recognition and phase recognition). Results show
+that Surg-FTDA outperforms baselines and generalizes well across downstream
+tasks.
+ Conclusion: We propose a text-driven adaptation approach that mitigates the
+modality gap and handles multiple downstream tasks in surgical workflow
+analysis, with minimal reliance on large annotated datasets. The code and
+dataset will be released at https://github.com/CAMMA-public/Surg-FTDA
+
+
+
+
+
+
+ + ♻ ☆ VCRScore: Image captioning metric based on V\&L Transformers, CLIP, and + precision-recall + + +
+ Image captioning has become an essential Vision & Language research task. It
+is about predicting the most accurate caption given a specific image or video.
+The research community has achieved impressive results by continuously
+proposing new models and approaches to improve the overall model's performance.
+Nevertheless, despite increasing proposals, the performance metrics used to
+measure their advances have remained practically untouched through the years.
+As evidence of this, metrics like BLEU, METEOR, CIDEr, and ROUGE are still
+widely used today, alongside more sophisticated metrics such as BERTScore and
+CLIPScore.
+ Hence, it is essential to adjust how we measure the advances, limitations, and
+scope of new image captioning proposals, as well as to adapt metrics to these
+new, more advanced image captioning approaches.
+ This work proposes a new evaluation metric for the image captioning problem.
+To do that, we first generated a human-labeled dataset to assess the degree to
+which captions correlate with the image's content. Taking these human scores as
+ground truth, we propose a new metric and compare it with several well-known
+metrics, from classical to newer ones. The proposed metric outperforms them,
+and interesting insights are presented and discussed.
+
+
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Learning Point Spread Function Invertibility Assessment for Image + Deconvolution + + +
+ Deep-learning (DL)-based image deconvolution (ID) has exhibited remarkable
+recovery performance, surpassing traditional linear methods. However, unlike
+traditional ID approaches, which rely on analytical properties of the point
+spread function (PSF), such as specific spectral properties or a small
+condition number of the convolution matrix, to achieve high recovery
+performance, DL techniques lack quantifiable metrics for evaluating PSF
+suitability for DL-assisted recovery. Aiming to enhance deconvolution quality,
+we propose a metric that employs a non-linear approach to learn the
+invertibility of an arbitrary PSF using a neural network by mapping it to a
+unit impulse. A lower discrepancy between the mapped PSF and a unit impulse
+indicates a higher likelihood of successful inversion by a DL network. Our
+findings reveal that this metric correlates with high recovery performance in
+DL and traditional methods, thereby serving as an effective regularizer in
+deconvolution tasks. This approach reduces the computational complexity over
+conventional condition number assessments and is a differentiable process.
+These useful properties allow its application in designing diffractive optical
+elements through end-to-end (E2E) optimization, achieving invertible PSFs, and
+outperforming the E2E baseline framework.
+
+
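+ The proposed metric can be approximated in spirit by fitting a small network
+that maps a PSF to a unit impulse and reading the residual error as an
+(inverse) invertibility score; the network size, optimizer, and number of
+fitting steps below are assumptions for illustration, not the paper's exact
+configuration.
+
+# Sketch: learn a non-linear map from a PSF to a unit impulse; the residual
+# discrepancy after fitting serves as an invertibility score (lower = easier).
+import torch
+import torch.nn as nn
+
+def invertibility_score(psf: torch.Tensor, steps: int = 300) -> float:
+    """psf: (1, 1, H, W), normalized to sum to 1."""
+    net = nn.Sequential(
+        nn.Conv2d(1, 16, 5, padding=2), nn.ReLU(),
+        nn.Conv2d(16, 16, 5, padding=2), nn.ReLU(),
+        nn.Conv2d(16, 1, 5, padding=2))
+    target = torch.zeros_like(psf)
+    h, w = psf.shape[-2:]
+    target[..., h // 2, w // 2] = 1.0                 # unit impulse centered in the patch
+    opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+    for _ in range(steps):
+        opt.zero_grad()
+        loss = nn.functional.mse_loss(net(psf), target)
+        loss.backward()
+        opt.step()
+    return loss.item()                                # remaining discrepancy after fitting
+
+if __name__ == "__main__":
+    delta = torch.zeros(1, 1, 31, 31); delta[..., 15, 15] = 1.0   # trivially invertible PSF
+    blur = torch.ones(1, 1, 31, 31) / (31 * 31)                   # heavy box blur
+    print("delta PSF:", invertibility_score(delta))
+    print("box PSF  :", invertibility_score(blur))
+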
+
+ comment: Accepted at the 2024 32nd European Signal Processing Conference + (EUSIPCO), 2024 +
+
+
+
+
+ + ♻ ☆ 3DGS$^2$: Near Second-order Converging 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a mainstream solution for novel
+view synthesis and 3D reconstruction. By explicitly encoding a 3D scene using a
+collection of Gaussian kernels, 3DGS achieves high-quality rendering with
+superior efficiency. As a learning-based approach, 3DGS training has typically
+been handled with the standard stochastic gradient descent (SGD) method, which
+offers at most linear convergence. Consequently, training often requires tens
+of minutes, even with GPU acceleration. This paper introduces a (near)
+second-order convergent training algorithm for 3DGS, leveraging its unique
+properties. Our approach is inspired by two key observations. First, the
+attributes of a Gaussian kernel contribute independently to the image-space
+loss, which endorses isolated and local optimization algorithms. We exploit
+this by splitting the optimization at the level of individual kernel
+attributes, analytically constructing small-size Newton systems for each
+parameter group, and efficiently solving these systems on GPU threads. This
+achieves Newton-like convergence per training image without relying on the
+global Hessian. Second, kernels exhibit sparse and structured coupling across
+input images. This property allows us to effectively utilize spatial
+information to mitigate overshoot during stochastic training. Our method
+converges an order of magnitude faster than standard GPU-based 3DGS training,
+requiring over $10\times$ fewer iterations while maintaining or surpassing the
+quality of SGD-based 3DGS reconstructions.
+
+
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ MADation: Face Morphing Attack Detection with Foundation Models + + +
+ Despite the considerable performance improvements of face recognition
+algorithms in recent years, the same scientific advances responsible for this
+progress can also be used to create efficient ways to attack them, posing a
+threat to their secure deployment. Morphing attack detection (MAD) systems aim
+to detect a specific type of threat, morphing attacks, at an early stage,
+preventing them from being considered for verification in critical processes.
+Foundation models (FM) learn from extensive amounts of unlabelled data,
+achieving remarkable zero-shot generalization to unseen domains. Although this
+generalization capacity might be weak when dealing with domain-specific
+downstream tasks such as MAD, FMs can easily adapt to these settings while
+retaining the built-in knowledge acquired during pre-training. In this work, we
+recognize the potential of FMs to perform well in the MAD task when properly
+adapted to its specificities. To this end, we adapt FM CLIP architectures with
+LoRA weights while simultaneously training a classification header. The
+proposed framework, MADation, surpasses our alternative FM- and
+transformer-based frameworks and constitutes the first adaptation of FMs to the
+MAD task. MADation presents competitive results with current MAD solutions in
+the literature and even surpasses them in several evaluation scenarios. To
+encourage reproducibility and facilitate further research in MAD, we publicly
+release the implementation of MADation at
+https://github.com/gurayozgur/MADation
+
+
+
+ comment: Accepted at WACV 2025 workshops +
+
+
+
+
+ + ♻ ☆ CAFuser: Condition-Aware Multimodal Fusion for Robust Semantic + Perception of Driving Scenes + + +
+ Leveraging multiple sensors is crucial for robust semantic perception in +autonomous driving, as each sensor type has complementary strengths and +weaknesses. However, existing sensor fusion methods often treat sensors +uniformly across all conditions, leading to suboptimal performance. By +contrast, we propose a novel, condition-aware multimodal fusion approach for +robust semantic perception of driving scenes. Our method, CAFuser, uses an RGB +camera input to classify environmental conditions and generate a Condition +Token that guides the fusion of multiple sensor modalities. We further newly +introduce modality-specific feature adapters to align diverse sensor inputs +into a shared latent space, enabling efficient integration with a single and +shared pre-trained backbone. By dynamically adapting sensor fusion based on the +actual condition, our model significantly improves robustness and accuracy, +especially in adverse-condition scenarios. CAFuser ranks first on the public +MUSES benchmarks, achieving 59.7 PQ for multimodal panoptic and 78.2 mIoU for +semantic segmentation, and also sets the new state of the art on DeLiVER. The +source code is publicly available at: https://github.com/timbroed/CAFuser. + +
+
+ comment: IEEE Robotics and Automation Letters, The source code is publicly + available at: https://github.com/timbroed/CAFuser +
+
+
+
+
+ + ♻ ☆ Segmentation Dataset for Reinforced Concrete Construction + + +
+ This paper provides a dataset of 14,805 RGB images with segmentation labels +for autonomous robotic inspection of reinforced concrete defects. Baselines for +the YOLOv8L-seg, DeepLabV3, and U-Net segmentation models are established. +Labelling inconsistencies are addressed statistically, and their influence on +model performance is analyzed. An error identification tool is employed to +examine the error modes of the models. The paper demonstrates that YOLOv8L-seg +performs best, achieving a validation mIOU score of up to 0.59. Label +inconsistencies were found to have a negligible effect on model performance, +while the inclusion of more data improved the performance. False negatives were +identified as the primary failure mode. The results highlight the importance of +data availability for the performance of deep learning-based models. The lack +of publicly available data is identified as a significant contributor to false +negatives. To address this, the paper advocates for an increased open-source +approach within the construction community. + +
+
+ comment: The ConRebSeg Dataset can be found under the following DOI: + https://doi.org/10.11583/DTU.26213762 Corresponding code to download + additional data and initialize the dataset under + https://github.com/DTU-PAS/ConRebSeg This work is an accepted manuscript up + for publication in the Elsevier journal "Automation in Construction" +
+
+
+
+
+ + ♻ ☆ Accelerating lensed quasar discovery and modeling with physics-informed + variational autoencoders + + +
+ Strongly lensed quasars provide valuable insights into the rate of cosmic +expansion, the distribution of dark matter in foreground deflectors, and the +characteristics of quasar hosts. However, detecting them in astronomical images +is difficult due to the prevalence of non-lensing objects. To address this +challenge, we developed a generative deep learning model called VariLens, built +upon a physics-informed variational autoencoder. This model seamlessly +integrates three essential modules: image reconstruction, object +classification, and lens modeling, offering a fast and comprehensive approach +to strong lens analysis. VariLens is capable of rapidly determining both (1) +the probability that an object is a lens system and (2) key parameters of a +singular isothermal ellipsoid (SIE) mass model -- including the Einstein radius +($\theta_\mathrm{E}$), lens center, and ellipticity -- in just milliseconds +using a single CPU. A direct comparison of VariLens estimates with traditional +lens modeling for 20 known lensed quasars within the Subaru Hyper Suprime-Cam +(HSC) footprint shows good agreement, with both results consistent within +$2\sigma$ for systems with $\theta_\mathrm{E}<3$ arcsecs. To identify new +lensed quasar candidates, we begin with an initial sample of approximately 80 +million sources, combining HSC data with multiwavelength information from +various surveys. After applying a photometric preselection aimed at locating +$z>1.5$ sources, the number of candidates was reduced to 710,966. Subsequently, +VariLens highlights 13,831 sources, each showing a high likelihood of being a +lens. A visual assessment of these objects results in 42 promising candidates +that await spectroscopic confirmation. These results underscore the potential +of automated deep learning pipelines to efficiently detect and model strong +lenses in large datasets. + +
+
+ comment: Accepted for publication in the Astronomy & Astrophysics journal and + updated to reflect the revised version. The paper consists of 15 main pages, + 12 figures, and 1 table. We welcome feedback and comments from readers! +
+
+
+
+
+ + ♻ ☆ Dimensions underlying the representational alignment of deep neural + networks with humans + + +
+ Determining the similarities and differences between humans and artificial +intelligence (AI) is an important goal both in computational cognitive +neuroscience and machine learning, promising a deeper understanding of human +cognition and safer, more reliable AI systems. Much previous work comparing +representations in humans and AI has relied on global, scalar measures to +quantify their alignment. However, without explicit hypotheses, these measures +only inform us about the degree of alignment, not the factors that determine +it. To address this challenge, we propose a generic framework to compare human +and AI representations, based on identifying latent representational dimensions +underlying the same behavior in both domains. Applying this framework to humans +and a deep neural network (DNN) model of natural images revealed a +low-dimensional DNN embedding of both visual and semantic dimensions. In +contrast to humans, DNNs exhibited a clear dominance of visual over semantic +properties, indicating divergent strategies for representing images. While +in-silico experiments showed seemingly consistent interpretability of DNN +dimensions, a direct comparison between human and DNN representations revealed +substantial differences in how they process images. By making representations +directly comparable, our results reveal important challenges for +representational alignment and offer a means for improving their comparability. + +
+
+
+
+
+ + ♻ ☆ Multi-Tiered Self-Contrastive Learning for Medical Microwave Radiometry + (MWR) Breast Cancer Detection + + +
+ Improving breast cancer detection and monitoring techniques is a critical +objective in healthcare, driving the need for innovative imaging technologies +and diagnostic approaches. This study introduces a novel multi-tiered +self-contrastive model tailored for microwave radiometry (MWR) in breast cancer +detection. Our approach incorporates three distinct models: Local-MWR (L-MWR), +Regional-MWR (R-MWR), and Global-MWR (G-MWR), designed to analyze varying +sub-regional comparisons within the breasts. These models are integrated +through the Joint-MWR (J-MWR) network, which leverages self-contrastive results +at each analytical level to improve diagnostic accuracy. Utilizing a dataset of +4,932 female patients, our research demonstrates the efficacy of our proposed +models. Notably, the J-MWR model achieves a Matthews correlation coefficient +of 0.74 $\pm$ 0.018, surpassing existing MWR neural networks and contrastive +methods. These findings highlight the potential of self-contrastive learning +techniques in improving the diagnostic accuracy and generalizability of +MWR-based breast cancer detection. This advancement holds considerable promise +for future investigations into enabling point-of-care testing. The source code +is available at: https://github.com/cgalaz01/self_contrastive_mwr. +
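For reference, the Matthews correlation coefficient reported above is computed directly from the confusion matrix; a minimal sketch of the standard definition (not the authors' implementation), with hypothetical counts in the example:

```python
import math

def matthews_corrcoef(tp, tn, fp, fn):
    """Matthews correlation coefficient from confusion-matrix counts."""
    num = tp * tn - fp * fn
    den = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return num / den if den else 0.0

# example with made-up counts for a binary detection task
print(matthews_corrcoef(tp=420, tn=380, fp=60, fn=72))
```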
+
+
+
+
+ + ♻ ☆ Textualize Visual Prompt for Image Editing via Diffusion Bridge + + +
+ Visual prompt, a pair of before-and-after edited images, can convey +indescribable imagery transformations and prosper in image editing. However, +current visual prompt methods rely on a pretrained text-guided image-to-image +generative model that requires a triplet of text, before, and after images for +retraining over a text-to-image model. Such triplet crafting and retraining +processes limit the scalability and generalization of editing. In this paper, +we present a framework based on any single text-to-image model without reliance +on an explicit image-to-image model, thus enhancing the generalizability and +scalability. Specifically, by leveraging the probability-flow ordinary +differential equation, we construct a diffusion bridge to transfer the distribution between +before-and-after images under the text guidance. By optimizing the text via the +bridge, the framework adaptively textualizes the editing transformation +conveyed by visual prompts into text embeddings without other models. +Meanwhile, we introduce differential attention control during text +optimization, which disentangles the text embedding from the invariance of the +before-and-after images and makes it solely capture the delicate transformation +and generalize to edit various images. Experiments on real images validate +competitive results on the generalization, contextual coherence, and high +fidelity for delicate editing with just one image pair as the visual prompt. +
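The probability-flow ODE referenced here is the deterministic counterpart of the diffusion SDE, dx/dt = f(x, t) - (1/2) g(t)^2 grad_x log p_t(x). A minimal Euler integrator sketch is shown below; the score function, drift, and diffusion callables are placeholders standing in for a (text-conditioned) diffusion model, not the paper's bridge construction.

```python
import numpy as np

def integrate_pf_ode(x0, score_fn, f_fn, g_fn, t0=1.0, t1=0.0, n_steps=500):
    """Euler integration of the probability-flow ODE
    dx/dt = f(x, t) - 0.5 * g(t)**2 * score_fn(x, t),
    where score_fn approximates grad_x log p_t(x)."""
    x = np.array(x0, dtype=float)
    dt = (t1 - t0) / n_steps
    t = t0
    for _ in range(n_steps):
        drift = f_fn(x, t) - 0.5 * g_fn(t) ** 2 * score_fn(x, t)
        x = x + drift * dt
        t += dt
    return x

# toy VP-style example: f = -0.5*beta*x, g = sqrt(beta), score of a unit Gaussian
beta = 1.0
x_end = integrate_pf_ode(
    x0=[1.5, -0.3],
    score_fn=lambda x, t: -x,            # score of N(0, I), a stand-in only
    f_fn=lambda x, t: -0.5 * beta * x,
    g_fn=lambda t: np.sqrt(beta),
)
print(x_end)
```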
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Implicit Location-Caption Alignment via Complementary Masking for + Weakly-Supervised Dense Video Captioning + + +
+ Weakly-Supervised Dense Video Captioning (WSDVC) aims to localize and +describe all events of interest in a video without requiring annotations of +event boundaries. This setting poses a great challenge in accurately locating +the temporal locations of events, as the relevant supervision is unavailable. +Existing methods rely on explicit alignment constraints between event locations +and captions, which involve complex event proposal procedures during both +training and inference. To tackle this problem, we propose a novel implicit +location-caption alignment paradigm by complementary masking, which simplifies +the complex event proposal and localization process while maintaining +effectiveness. Specifically, our model comprises two components: a dual-mode +video captioning module and a mask generation module. The dual-mode video +captioning module captures global event information and generates descriptive +captions, while the mask generation module generates differentiable positive +and negative masks for localizing the events. These masks enable the implicit +alignment of event locations and captions by ensuring that captions generated +from positively and negatively masked videos are complementary, thereby forming +a complete video description. In this way, even under weak supervision, the +event location and event caption can be aligned implicitly. Extensive +experiments on public datasets demonstrate that our method outperforms +existing weakly-supervised methods and achieves competitive results compared to +fully-supervised methods. +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Benchmarking Vision Foundation Models for Input Monitoring in Autonomous + Driving + + +
+ Deep neural networks (DNNs) remain challenged by distribution shifts in +complex open-world domains like automated driving (AD): Absolute robustness +against yet unknown novel objects (semantic shift) or styles like lighting +conditions (covariate shift) cannot be guaranteed. Hence, reliable +operation-time monitors for identification of out-of-training-data-distribution +(OOD) scenarios are imperative. Current approaches for OOD classification are +untested for complex domains like AD, are limited in the kinds of shifts they +detect, or even require supervision with OOD samples. To prepare for +unanticipated shifts, we instead establish a framework around a principled, +unsupervised, and model-agnostic method that unifies detection of all kinds of +shifts: Find a full model of the training data's feature distribution, to then +use its density at new points as in-distribution (ID) score. To implement this, +we propose to combine the newly available Vision Foundation Models (VFM) as +feature extractors with one of four alternative density modeling techniques. In +an extensive benchmark of 4 VFMs against 20 baselines, we show the superior +performance of VFM feature encodings compared to shift-specific OOD monitors. +Additionally, we find that sophisticated architectures outperform larger latent +space dimensionality; and our method identifies samples with higher risk of +errors on downstream tasks, despite being model-agnostic. This suggests that +VFMs are promising to realize model-agnostic, unsupervised, reliable safety +monitors in complex vision tasks. + +
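The monitoring recipe described here, fit a density model on in-distribution feature embeddings and use the log-density of a new point as its ID score, can be sketched with a Gaussian mixture as one interchangeable choice of density model. The feature arrays, component count, and threshold below are placeholders; the paper benchmarks four alternative density models on VFM features rather than this exact setup.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

# train_feats / test_feats stand in for VFM embeddings of training and new frames
rng = np.random.default_rng(0)
train_feats = rng.normal(size=(2000, 64))
test_feats = rng.normal(loc=0.5, size=(10, 64))

density = GaussianMixture(n_components=8, covariance_type="diag", random_state=0)
density.fit(train_feats)

id_scores = density.score_samples(test_feats)   # log-density = in-distribution score
threshold = np.quantile(density.score_samples(train_feats), 0.01)
is_ood = id_scores < threshold                  # flag low-density (likely OOD) samples
print(is_ood)
```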
+
+
+
+
+ + ♻ ☆ Towards Kriging-informed Conditional Diffusion for Regional Sea-Level + Data Downscaling + + +
+ Given coarser-resolution projections from global climate models or satellite +data, the downscaling problem aims to estimate finer-resolution regional +climate data, capturing fine-scale spatial patterns and variability. +Downscaling is any method to derive high-resolution data from low-resolution +variables, often to provide more detailed and local predictions and analyses. +This problem is societally crucial for effective adaptation, mitigation, and +resilience against significant risks from climate change. The challenge arises +from spatial heterogeneity and the need to recover finer-scale features while +ensuring model generalization. Most downscaling methods \cite{Li2020} fail to +capture the spatial dependencies at finer scales and underperform on real-world +climate datasets, such as sea-level rise. We propose a novel Kriging-informed +Conditional Diffusion Probabilistic Model (Ki-CDPM) to capture spatial +variability while preserving fine-scale features. Experimental results on +climate data show that our proposed method is more accurate than +state-of-the-art downscaling techniques. + +
+
+
+
+
+ + ♻ ☆ SpectralKD: A Unified Framework for Interpreting and Distilling Vision + Transformers via Spectral Analysis + + +
+ Knowledge Distillation (KD) has achieved widespread success in compressing +large Vision Transformers (ViTs), but a unified theoretical framework for both +ViTs and KD is still lacking. In this paper, we propose SpectralKD, a novel +unified analytical framework that offers deeper insights into ViTs and +optimizes KD via spectral analysis. Our model-wise analysis reveals that CaiT +concentrates information in its first and last few layers, informing optimal +layer selection for KD. Surprisingly, our layer-wise analysis discovers that +Swin Transformer and CaiT exhibit similar spectral encoding patterns despite +their architectural differences, leading to a feature map alignment guideline. +Building on these insights, we propose a simple yet effective spectral +alignment method for KD. Benefiting from the deeper understanding provided by the above +analysis, even this simple strategy achieves state-of-the-art +performance on ImageNet-1K without introducing any trainable parameters, +improving DeiT-Tiny by $+5.2\%$ and Swin-Tiny by $+1.4\%$ in top-1 accuracy. +Furthermore, our post-training analysis reveals that distilled students can +reproduce spectral patterns similar to their teachers, opening a new area we +term ``distillation dynamics". Code and experimental logs are available at +https://github.com/thy960112/SpectralKD. +
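One generic way to picture the kind of layer-wise spectral statistics used above is to take the 2D Fourier transform of intermediate feature maps and compare their energy distributions across layers. The sketch below is only an illustration of that idea with random tensors; the feature shapes and the comparison metric are assumptions, not the SpectralKD analysis.

```python
import numpy as np

def mean_spectrum(feature_map):
    """Channel-averaged magnitude spectrum of a (C, H, W) feature map."""
    fft = np.fft.fftshift(np.fft.fft2(feature_map, axes=(-2, -1)), axes=(-2, -1))
    return np.abs(fft).mean(axis=0)  # (H, W) spectrum averaged over channels

# hypothetical feature maps from two transformer layers, reshaped to (C, H, W)
layer_a = np.random.randn(192, 14, 14)
layer_b = np.random.randn(192, 14, 14)
spec_a, spec_b = mean_spectrum(layer_a), mean_spectrum(layer_b)
print(np.linalg.norm(spec_a - spec_b))  # crude distance between spectral encodings
```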
+
+
+
+
+ + ♻ ☆ FedDAG: Federated Domain Adversarial Generation Towards Generalizable + Medical Image Analysis + + +
+ Federated domain generalization aims to train a global model from multiple +source domains and ensure its generalization ability to unseen target domains. +Because the target domain involves unknown domain shifts, approximating these +gaps with the source domains may be the key to improving model +generalization capability. Existing works mainly focus on sharing and +recombining local domain-specific attributes to increase data diversity and +simulate potential domain shifts. However, these methods may be insufficient, +since local attribute recombination alone can hardly reach the +out-of-distribution regions of the global data. In this paper, we propose a +simple-yet-efficient framework named Federated Domain Adversarial Generation +(FedDAG). It aims to simulate the domain shift and improve the model +generalization by adversarially generating novel domains different from local +and global source domains. Specifically, it generates novel-style images by +maximizing the instance-level feature discrepancy between original and +generated images and trains a generalizable task model by minimizing their +feature discrepancy. Further, we observed that FedDAG yields uneven +performance improvements across local models. This is likely due to inherent data +isolation and heterogeneity among clients, which exacerbate the imbalance in their +generalization contributions to the global model. Ignoring this imbalance can +leave the global model's generalization ability sub-optimal, further +limiting the novel domain generation procedure. Thus, to mitigate this +imbalance, FedDAG hierarchically aggregates local models at the within-client +and across-client levels by using the sharpness concept to evaluate client +model generalization contributions. Extensive experiments across four medical +benchmarks demonstrate FedDAG's ability to enhance generalization in federated +medical scenarios. +
+
+
+
+
+ + ♻ ☆ Task Me Anything NeurIPS 2024 + + +
+ Benchmarks for large multimodal language models (MLMs) now serve to +simultaneously assess the general capabilities of models instead of evaluating +for a specific capability. As a result, when a developer wants to identify +which models to use for their application, they are overwhelmed by the number +of benchmarks and remain uncertain about which benchmark's results are most +reflective of their specific use case. This paper introduces Task-Me-Anything, +a benchmark generation engine which produces a benchmark tailored to a user's +needs. Task-Me-Anything maintains an extendable taxonomy of visual assets and +can programmatically generate a vast number of task instances. Additionally, it +algorithmically addresses user queries regarding MLM performance efficiently +within a computational budget. It contains 113K images, 10K videos, 2K 3D +object assets, over 365 object categories, 655 attributes, and 335 +relationships. It can generate 750M image/video question-answering pairs, which +focus on evaluating MLM perceptual capabilities. Task-Me-Anything reveals +critical insights: open-source MLMs excel in object and attribute recognition +but lack spatial and temporal understanding; each model exhibits unique +strengths and weaknesses; larger models generally perform better, though +exceptions exist; and GPT4o demonstrates challenges in recognizing +rotating/moving objects and distinguishing colors. + +
+
+ comment: NeurIPS 2024 Track on Datasets and Benchmarks. Website: + https://www.task-me-anything.org +
+
+
+
+
+ + ♻ ☆ Make-A-Texture: Fast Shape-Aware Texture Generation in 3 Seconds + + +
+ We present Make-A-Texture, a new framework that efficiently synthesizes +high-resolution texture maps from textual prompts for given 3D geometries. Our +approach progressively generates textures that are consistent across multiple +viewpoints with a depth-aware inpainting diffusion model, in an optimized +sequence of viewpoints determined by an automatic view selection algorithm. + A significant feature of our method is its remarkable efficiency, achieving a +full texture generation within an end-to-end runtime of just 3.07 seconds on a +single NVIDIA H100 GPU, significantly outperforming existing methods. Such an +acceleration is achieved by optimizations in the diffusion model and a +specialized backprojection method. Moreover, our method reduces the artifacts +in the backprojection phase, by selectively masking out non-frontal faces, and +internal faces of open-surfaced objects. + Experimental results demonstrate that Make-A-Texture matches or exceeds the +quality of other state-of-the-art methods. Our work significantly improves the +applicability and practicality of texture generation models for real-world 3D +content creation, including interactive creation and text-guided texture +editing. + +
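The backprojection masking step, discarding faces that do not face the camera, amounts to a dot-product test between face normals and the viewing direction. A simplified sketch with hypothetical inputs follows; the threshold value and function name are assumptions, not the Make-A-Texture implementation.

```python
import numpy as np

def frontal_face_mask(face_normals, view_dir, cos_threshold=0.2):
    """Keep only mesh faces oriented toward the camera.

    face_normals: (F, 3) unit normals; view_dir: (3,) unit vector pointing from
    the surface toward the camera. A face is kept when the dot product between
    its normal and the view direction exceeds cos_threshold.
    """
    dots = face_normals @ np.asarray(view_dir)
    return dots > cos_threshold  # boolean mask over faces

normals = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, -1.0], [0.7, 0.0, 0.714]])
print(frontal_face_mask(normals, view_dir=[0.0, 0.0, 1.0]))  # [ True False  True]
```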
+
+ comment: Accepted to WACV 2025 Webpage: + https://mukosame.github.io/make-a-texture/ Video: + https://www.youtube.com/watch?v=2Ctqdx1uaj0 +
+
+
+
+
+ + ♻ ☆ EasySplat: View-Adaptive Learning makes 3D Gaussian Splatting Easy + + +
+ 3D Gaussian Splatting (3DGS) techniques have achieved satisfactory 3D scene +representation. Despite their impressive performance, they confront challenges +due to the limitations of structure-from-motion (SfM) methods in acquiring +accurate scene initialization, or the inefficiency of the densification strategy. +In this paper, we introduce a novel framework EasySplat to achieve high-quality +3DGS modeling. Instead of using SfM for scene initialization, we employ a novel +method to unleash the power of large-scale pointmap approaches. Specifically, +we propose an efficient grouping strategy based on view similarity, and use +robust pointmap priors to obtain high-quality point clouds and camera poses for +3D scene initialization. After obtaining a reliable scene structure, we propose +a novel densification approach that adaptively splits Gaussian primitives based +on the average shape of neighboring Gaussian ellipsoids, utilizing a KNN scheme. +In this way, the proposed method tackles the limitations of initialization and +optimization, leading to efficient and accurate 3DGS modeling. Extensive +experiments demonstrate that EasySplat outperforms the current state-of-the-art +(SOTA) in handling novel view synthesis. +
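A KNN-based splitting rule of the kind sketched above can be prototyped with a KD-tree: flag a Gaussian for splitting when its extent is large relative to the average shape of its nearest neighbors. The shape statistic, neighbor count, and ratio threshold below are placeholders, not EasySplat's exact criterion.

```python
import numpy as np
from scipy.spatial import cKDTree

def split_candidates(centers, scales, k=8, ratio=1.5):
    """Flag Gaussians whose mean scale exceeds `ratio` times the average
    mean scale of their k nearest neighbors (placeholder criterion)."""
    tree = cKDTree(centers)
    _, idx = tree.query(centers, k=k + 1)        # first neighbor is the point itself
    mean_scale = scales.mean(axis=1)             # per-Gaussian average extent
    neighbor_scale = mean_scale[idx[:, 1:]].mean(axis=1)
    return mean_scale > ratio * neighbor_scale   # boolean mask of Gaussians to split

centers = np.random.rand(1000, 3)
scales = np.abs(np.random.randn(1000, 3)) * 0.01
print(split_candidates(centers, scales).sum(), "Gaussians flagged for splitting")
```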
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A New Cross-Space Total Variation Regularization Model for Color Image + Restoration with Quaternion Blur Operator + + +
+ The cross-channel deblurring problem in color image processing is difficult +to solve due to the complex coupling and structural blurring of color pixels. +To date, there have been few efficient algorithms that can reduce color artifacts +during the deblurring process. To solve this challenging problem, we present a novel +cross-space total variation (CSTV) regularization model for color image +deblurring by introducing a quaternion blur operator and a cross-color space +regularization functional. The existence and uniqueness of the solution are +proved and a new L-curve method is proposed to find a balance of regularization +terms on different color spaces. The Euler-Lagrange equation is derived to show +that CSTV has taken into account the coupling of all color channels and the +local smoothing within each color channel. A quaternion operator splitting +method is proposed for the first time to enhance the color artifact reduction ability +of the CSTV regularization model. This strategy also applies to the well-known +color deblurring models. Numerical experiments on color image databases +illustrate the efficiency and effectiveness of the new model and algorithms. +The color images restored by them successfully maintain the color and spatial +information and are of higher quality in terms of PSNR, SSIM, MSE and CIEde2000 +than the restorations of state-of-the-art methods. +
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ E2E-MFD: Towards End-to-End Synchronous Multimodal Fusion Detection + + +
+ Multimodal image fusion and object detection are crucial for autonomous +driving. While current methods have advanced the fusion of texture details and +semantic information, their complex training processes hinder broader +applications. Addressing this challenge, we introduce E2E-MFD, a novel +end-to-end algorithm for multimodal fusion detection. E2E-MFD streamlines the +process, achieving high performance with a single training phase. It employs +synchronous joint optimization across components to avoid suboptimal solutions +tied to individual tasks. Furthermore, it implements a comprehensive +optimization strategy in the gradient matrix for shared parameters, ensuring +convergence to an optimal fusion detection configuration. Our extensive testing +on multiple public datasets reveals E2E-MFD's superior capabilities, showcasing +not only visually appealing image fusion but also impressive detection +outcomes, such as a 3.9% and 2.0% mAP50 increase on horizontal object detection +dataset M3FD and oriented object detection dataset DroneVehicle, respectively, +compared to state-of-the-art approaches. The code is released at +https://github.com/icey-zhang/E2E-MFD. + +
+
+
+
+
+ + ♻ ☆ Deciphering Oracle Bone Language with Diffusion Models + + +
+ Originating from China's Shang Dynasty approximately 3,000 years ago, the +Oracle Bone Script (OBS) is a cornerstone in the annals of linguistic history, +predating many established writing systems. Despite the discovery of thousands +of inscriptions, a vast expanse of OBS remains undeciphered, casting a veil of +mystery over this ancient language. The emergence of modern AI technologies +presents a novel frontier for OBS decipherment, challenging traditional NLP +methods that rely heavily on large textual corpora, a luxury not afforded by +historical languages. This paper introduces a novel approach by adopting image +generation techniques, specifically through the development of Oracle Bone +Script Decipher (OBSD). Utilizing a conditional diffusion-based strategy, OBSD +generates vital clues for decipherment, charting a new course for AI-assisted +analysis of ancient languages. To validate its efficacy, extensive experiments +were conducted on an oracle bone script dataset, with quantitative results +demonstrating the effectiveness of OBSD. Code and decipherment results will be +made available at https://github.com/guanhaisu/OBSD. + +
+
+ comment: ACL 2024 Best Paper +
+
+
+
+
+ + ♻ ☆ Nautilus: Locality-aware Autoencoder for Scalable Mesh Generation + + +
+ Triangle meshes are fundamental to 3D applications, enabling efficient +modification and rasterization while maintaining compatibility with standard +rendering pipelines. However, current automatic mesh generation methods +typically rely on intermediate representations that lack the continuous surface +quality inherent to meshes. Converting these representations into meshes +produces dense, suboptimal outputs. Although recent autoregressive approaches +demonstrate promise in directly modeling mesh vertices and faces, they are +constrained by the limitation in face count, scalability, and structural +fidelity. To address these challenges, we propose Nautilus, a locality-aware +autoencoder for artist-like mesh generation that leverages the local properties +of manifold meshes to achieve structural fidelity and efficient representation. +Our approach introduces a novel tokenization algorithm that preserves face +proximity relationships and compresses sequence length through locally shared +vertices and edges, enabling the generation of meshes with an unprecedented +scale of up to 5,000 faces. Furthermore, we develop a Dual-stream Point +Conditioner that provides multi-scale geometric guidance, ensuring global +consistency and local structural fidelity by capturing fine-grained geometric +features. Extensive experiments demonstrate that Nautilus significantly +outperforms state-of-the-art methods in both fidelity and scalability. The +project page will be released to https://nautilusmeshgen.github.io. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ TEOChat: A Large Vision-Language Assistant for Temporal Earth + Observation Data ICLR 2025 + + +
+ Large vision and language assistants have enabled new capabilities for +interpreting natural images. These approaches have recently been adapted to +earth observation data, but they are only able to handle single image inputs, +limiting their use for many real-world tasks. In this work, we develop a new +vision and language assistant called TEOChat that can engage in conversations +about temporal sequences of earth observation data. To train TEOChat, we curate +an instruction-following dataset composed of many single image and temporal +tasks including building change and damage assessment, semantic change +detection, and temporal scene classification. We show that TEOChat can perform +a wide variety of spatial and temporal reasoning tasks, substantially +outperforming previous vision and language assistants, and even achieving +comparable or better performance than several specialist models trained to +perform specific tasks. Furthermore, TEOChat achieves impressive zero-shot +performance on a change detection and change question answering dataset, +outperforms GPT-4o and Gemini 1.5 Pro on multiple temporal tasks, and exhibits +stronger single image capabilities than a comparable single image +instruction-following model on scene classification, visual question answering, +and captioning. We publicly release our data, model, and code at +https://github.com/ermongroup/TEOChat . + +
+
+ comment: Published at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Comprehensive Performance Evaluation of YOLO11, YOLOv10, YOLOv9 and + YOLOv8 on Detecting and Counting Fruitlet in Complex Orchard Environments + + +
+ This study extensively evaluated You Only Look Once (YOLO) object detection +algorithms across all configurations (22 in total) of YOLOv8, YOLOv9, YOLOv10, and +YOLO11 (or YOLOv11) for green fruit detection in commercial orchards. The +research also validated in-field fruitlet counting using an iPhone and machine +vision sensors across four apple varieties: Scifresh, Scilate, Honeycrisp and +Cosmic Crisp. Among the 22 configurations evaluated, YOLOv11s and YOLOv9 +gelan-base outperformed others with mAP@50 scores of 0.933 and 0.935, +respectively. In terms of recall, YOLOv9 gelan-base achieved the highest value +among YOLOv9 configurations at 0.899, while YOLOv11m led YOLOv11 variants with +0.897. YOLO11n emerged as the fastest model, achieving the fastest inference speed +of only 2.4 ms, significantly outpacing the leading configurations of YOLOv10n, +YOLOv9 gelan-s, and YOLOv8n, with speeds of 5.5, 11.5, and 4.1 ms, +respectively. This comparative evaluation highlights the strengths of YOLOv11, +YOLOv9, and YOLOv10, offering researchers essential insights to choose the +best-suited model for fruitlet detection and possible automation in commercial +orchards. For real-time automation-related work on relevant datasets, we +recommend using YOLOv11n due to its high detection and image processing speed. +Keywords: YOLO11, YOLO11 Object Detection, YOLOv10, YOLOv9, YOLOv8, You Only +Look Once, Fruitlet Detection, Greenfruit Detection, YOLOv11 bounding box, +YOLOv11 detection, YOLOv11 object detection, YOLOv11 machine learning, YOLOv11 +Deep Learning +
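The mAP@50 figures above rest on the standard IoU test: a predicted box counts as a true positive when it overlaps a ground-truth box with IoU >= 0.5. A minimal IoU helper, independent of any particular YOLO release and with made-up example boxes:

```python
def iou(box_a, box_b):
    """Intersection-over-union of two axis-aligned boxes given as (x1, y1, x2, y2)."""
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter) if inter else 0.0

# does a predicted fruitlet box match a ground-truth box at the mAP@50 threshold?
print(iou((10, 10, 50, 50), (15, 12, 55, 52)) >= 0.5)  # True (IoU ~ 0.71)
```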
+
+ comment: 15 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Evaluation of GPT-4o and GPT-4o-mini's Vision Capabilities for + Compositional Analysis from Dried Solution Drops + + +
+ When microliter drops of salt solutions dry on non-porous surfaces, they form +erratic yet characteristic deposit patterns influenced by complex +crystallization dynamics and fluid motion. Using OpenAI's image-enabled +language models, we analyzed deposits from 12 salts with 200 images per salt +and per model. GPT-4o classified 57% of the salts accurately, significantly +outperforming random chance and GPT-4o mini. This study underscores the promise +of general-use AI tools for reliably identifying salts from their drying +patterns. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Generative Adversarial Networks in Ultrasound Imaging: Extending Field + of View Beyond Conventional Limits + + +
+ Transthoracic Echocardiography (TTE) is a fundamental, non-invasive +diagnostic tool in cardiovascular medicine, enabling detailed visualization of +cardiac structures crucial for diagnosing various heart conditions. Despite its +widespread use, TTE ultrasound imaging faces inherent limitations, notably the +trade-off between field of view (FoV) and resolution. This paper introduces a +novel application of conditional Generative Adversarial Networks (cGANs), +specifically designed to extend the FoV in TTE ultrasound imaging while +maintaining high resolution. Our proposed cGAN architecture, termed echoGAN, +demonstrates the capability to generate realistic anatomical structures through +outpainting, effectively broadening the viewable area in medical imaging. This +advancement has the potential to enhance both automatic and manual ultrasound +navigation, offering a more comprehensive view that could significantly reduce +the learning curve associated with ultrasound imaging and aid in more accurate +diagnoses. The results confirm that echoGAN reliably reproduces detailed cardiac +features, thereby promising a significant step forward in the field of +non-invasive cardiac navigation and diagnostics. +
+
+
+
+
+ + ♻ ☆ MSDet: Receptive Field Enhanced Multiscale Detection for Tiny Pulmonary + Nodule + + +
+ Pulmonary nodules are critical indicators for the early diagnosis of lung +cancer, making their detection essential for timely treatment. However, +traditional CT imaging methods suffered from cumbersome procedures, low +detection rates, and poor localization accuracy. The subtle differences between +pulmonary nodules and surrounding tissues in complex lung CT images, combined +with repeated downsampling in feature extraction networks, often lead to missed +or false detections of small nodules. Existing methods such as FPN, with its +fixed feature fusion and limited receptive field, struggle to effectively +overcome these issues. To address these challenges, our paper proposed three +key contributions: Firstly, we proposed MSDet, a multiscale attention and +receptive field network for detecting tiny pulmonary nodules. Secondly, we +proposed the extended receptive domain (ERD) strategy to capture richer +contextual information and reduce false positives caused by nodule occlusion. +We also proposed the position channel attention mechanism (PCAM) to optimize +feature learning and reduce multiscale detection errors, and designed the tiny +object detection block (TODB) to enhance the detection of tiny nodules. Lastly, +we conducted thorough experiments on the public LUNA16 dataset, achieving +state-of-the-art performance, with an mAP improvement of 8.8% over the previous +state-of-the-art method YOLOv8. These advancements significantly boosted +detection accuracy and reliability, providing a more effective solution for +early lung cancer diagnosis. The code will be available at +https://github.com/CaiGuoHui123/MSDet + +
+
+
+
+
+ + ♻ ☆ Label-Efficient Data Augmentation with Video Diffusion Models for + Guidewire Segmentation in Cardiac Fluoroscopy + + +
+ The accurate segmentation of guidewires in interventional cardiac fluoroscopy +videos is crucial for computer-aided navigation tasks. Although deep learning +methods have demonstrated high accuracy and robustness in wire segmentation, +they require substantial annotated datasets for generalizability, underscoring +the need for extensive labeled data to enhance model performance. To address +this challenge, we propose the Segmentation-guided Frame-consistency Video +Diffusion Model (SF-VD) to generate large collections of labeled fluoroscopy +videos, augmenting the training data for wire segmentation networks. SF-VD +leverages videos with limited annotations by independently modeling scene +distribution and motion distribution. It first samples the scene distribution +by generating 2D fluoroscopy images with wires positioned according to a +specified input mask, and then samples the motion distribution by progressively +generating subsequent frames, ensuring frame-to-frame coherence through a +frame-consistency strategy. A segmentation-guided mechanism further refines the +process by adjusting wire contrast, ensuring a diverse range of visibility in +the synthesized image. Evaluation on a fluoroscopy dataset confirms the +superior quality of the generated videos and shows significant improvements in +guidewire segmentation. + +
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Interpret Your Decision: Logical Reasoning Regularization for + Generalization in Visual Classification NeurIPS2024 + + +
+ Vision models excel in image classification but struggle to generalize to +unseen data, such as classifying images from unseen domains or discovering +novel categories. In this paper, we explore the relationship between logical +reasoning and deep learning generalization in visual classification. A logical +regularization termed L-Reg is derived which bridges a logical analysis +framework to image classification. Our work reveals that L-Reg reduces the +complexity of the model in terms of the feature distribution and classifier +weights. Specifically, we unveil the interpretability brought by L-Reg, as it +enables the model to extract the salient features, such as faces to persons, +for classification. Theoretical analysis and experiments demonstrate that L-Reg +enhances generalization across various scenarios, including multi-domain +generalization and generalized category discovery. In complex real-world +scenarios where images span unknown classes and unseen domains, L-Reg +consistently improves generalization, highlighting its practical efficacy. + +
+
+ comment: Accepted by NeurIPS2024 as Spotlight +
+
+
+
+
+ + ♻ ☆ QOC: Quantum On-Chip Training with Parameter Shift and Gradient Pruning + + +
+ Parameterized Quantum Circuits (PQC) are drawing increasing research interest +thanks to their potential to achieve quantum advantages on near-term Noisy +Intermediate Scale Quantum (NISQ) hardware. In order to achieve scalable PQC +learning, the training process needs to be offloaded to real quantum machines +instead of using exponential-cost classical simulators. One common approach to +obtain PQC gradients is parameter shift, whose cost scales linearly with the +number of qubits. We present QOC, the first experimental demonstration of +practical on-chip PQC training with parameter shift. Nevertheless, we find that +due to the significant quantum errors (noise) on real machines, gradients +obtained from naive parameter shift have low fidelity and thus degrade the +training accuracy. To this end, we further propose probabilistic gradient +pruning to first identify gradients with potentially large errors and then +remove them. Specifically, small gradients have larger relative errors than +large ones, and thus have a higher probability of being pruned. We perform extensive +experiments with the Quantum Neural Network (QNN) benchmarks on 5 +classification tasks using 5 real quantum machines. The results demonstrate +that our on-chip training achieves over 90% and 60% accuracy for 2-class and +4-class image classification tasks. The probabilistic gradient pruning brings +up to 7% PQC accuracy improvements over no pruning. Overall, we successfully +obtain similar on-chip training accuracy compared with noise-free simulation +but have much better training scalability. The QOC code is available in the +TorchQuantum library.
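The parameter-shift rule behind on-chip gradient estimation evaluates the circuit expectation at theta_i +/- pi/2 for each parameter: d<H>/d(theta_i) = (1/2)[<H>(theta_i + pi/2) - <H>(theta_i - pi/2)]. The sketch below uses a classical placeholder for the (noisy) hardware expectation, and the pruning probability model is a simplification of the paper's scheme, not its exact rule.

```python
import numpy as np

def parameter_shift_grad(expectation, params, shift=np.pi / 2):
    """Estimate d<H>/d(theta_i) by evaluating the circuit at theta_i +/- shift.
    `expectation(params)` is a placeholder for a (noisy) hardware evaluation."""
    grads = np.zeros_like(params)
    for i in range(len(params)):
        plus, minus = params.copy(), params.copy()
        plus[i] += shift
        minus[i] -= shift
        grads[i] = 0.5 * (expectation(plus) - expectation(minus))
    return grads

def probabilistic_prune(grads, keep_ratio=0.7, seed=0):
    """Simplified pruning: smaller gradients (larger relative error) are more
    likely to be zeroed out."""
    rng = np.random.default_rng(seed)
    mags = np.abs(grads)
    keep_prob = keep_ratio * mags / (mags.max() + 1e-12)  # grows with |gradient|
    keep = rng.random(len(grads)) < keep_prob
    return np.where(keep, grads, 0.0)

theta = np.array([0.3, 1.1, -0.7])
g = parameter_shift_grad(lambda p: np.cos(p).sum(), theta)  # toy expectation value
print(probabilistic_prune(g))
```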
+
+ comment: Published as a conference paper in DAC 2022; 7 pages, 8 figures; + open-source at https://github.com/mit-han-lab/torchquantum +
+
+
+
+
+ + ♻ ☆ BioTrove: A Large Curated Image Dataset Enabling AI for Biodiversity + + +
+ We introduce BioTrove, the largest publicly accessible dataset designed to +advance AI applications in biodiversity. Curated from the iNaturalist platform +and vetted to include only research-grade data, BioTrove contains 161.9 million +images, offering unprecedented scale and diversity from three primary kingdoms: +Animalia ("animals"), Fungi ("fungi"), and Plantae ("plants"), spanning +approximately 366.6K species. Each image is annotated with scientific names, +taxonomic hierarchies, and common names, providing rich metadata to support +accurate AI model development across diverse species and ecosystems. + We demonstrate the value of BioTrove by releasing a suite of CLIP models +trained using a subset of 40 million captioned images, known as BioTrove-Train. +This subset focuses on seven categories within the dataset that are +underrepresented in standard image recognition models, selected for their +critical role in biodiversity and agriculture: Aves ("birds"), Arachnida +("spiders/ticks/mites"), Insecta ("insects"), Plantae ("plants"), Fungi +("fungi"), Mollusca ("snails"), and Reptilia ("snakes/lizards"). To support +rigorous assessment, we introduce several new benchmarks and report model +accuracy for zero-shot learning across life stages, rare species, confounding +species, and multiple taxonomic levels. + We anticipate that BioTrove will spur the development of AI models capable of +supporting digital tools for pest control, crop monitoring, biodiversity +assessment, and environmental conservation. These advancements are crucial for +ensuring food security, preserving ecosystems, and mitigating the impacts of +climate change. BioTrove is publicly available, easily accessible, and ready +for immediate use. + +
+
+
+
+
+ + ♻ ☆ Slot-BERT: Self-supervised Object Discovery in Surgical Video + + +
+ Object-centric slot attention is a powerful framework for unsupervised +learning of structured and explainable representations that can support +reasoning about objects and actions, including in surgical videos. While +conventional object-centric methods for videos leverage recurrent processing to +achieve efficiency, they often struggle with maintaining long-range temporal +coherence required for long videos in surgical applications. On the other hand, +fully parallel processing of entire videos enhances temporal consistency but +introduces significant computational overhead, making it impractical for +implementation on hardware in medical facilities. We present Slot-BERT, a +bidirectional long-range model that learns object-centric representations in a +latent space while ensuring robust temporal coherence. Slot-BERT scales object +discovery seamlessly to long videos of unconstrained lengths. A novel slot +contrastive loss further reduces redundancy and improves the representation +disentanglement by enhancing slot orthogonality. We evaluate Slot-BERT on +real-world surgical video datasets from abdominal, cholecystectomy, and +thoracic procedures. Our method surpasses state-of-the-art object-centric +approaches under unsupervised training achieving superior performance across +diverse domains. We also demonstrate efficient zero-shot domain adaptation to +data from diverse surgical specialties and databases. + +
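The effect of a loss that enhances slot orthogonality can be pictured as penalizing off-diagonal cosine similarity between normalized slot vectors. The sketch below is a generic orthogonality penalty illustrating that idea, not the exact Slot-BERT contrastive objective; the slot shapes are hypothetical.

```python
import numpy as np

def slot_orthogonality_penalty(slots):
    """Penalize pairwise similarity between different slots.

    slots: (N, D) array of slot vectors from one frame or clip.
    """
    normed = slots / (np.linalg.norm(slots, axis=1, keepdims=True) + 1e-8)
    sim = normed @ normed.T                      # (N, N) cosine similarities
    off_diag = sim - np.diag(np.diag(sim))       # zero out the diagonal
    return np.square(off_diag).sum() / (slots.shape[0] * (slots.shape[0] - 1))

print(slot_orthogonality_penalty(np.random.randn(7, 64)))
```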
+
+
+
+
+ + ♻ ☆ JAM: A Comprehensive Model for Age Estimation, Verification, and + Comparability + + +
+ This paper introduces a comprehensive model for age estimation, verification, +and comparability, offering a comprehensive solution for a wide range of +applications. It employs advanced learning techniques to understand age +distribution and uses confidence scores to create probabilistic age ranges, +enhancing its ability to handle ambiguous cases. The model has been tested on +both proprietary and public datasets and compared against one of the +top-performing models in the field. Additionally, it has recently been +evaluated by NIST as part of the FATE challenge, achieving top places in many +categories. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 10 + +
+
+
+ + ☆ AirIO: Learning Inertial Odometry with Enhanced IMU Feature + Observability + + +
+ Inertial odometry (IO) using only Inertial Measurement Units (IMUs) offers a +lightweight and cost-effective solution for Unmanned Aerial Vehicle (UAV) +applications, yet existing learning-based IO models often fail to generalize to +UAVs due to the highly dynamic and non-linear flight patterns that differ from +pedestrian motion. In this work, we identify that the conventional practice of +transforming raw IMU data to global coordinates undermines the observability of +critical kinematic information in UAVs. By preserving the body-frame +representation, our method achieves substantial performance improvements, with +a 66.7% average increase in accuracy across three datasets. Furthermore, +explicitly encoding attitude information into the motion network results in an +additional 23.8% improvement over prior results. Combined with a data-driven +IMU correction model (AirIMU) and an uncertainty-aware Extended Kalman Filter +(EKF), our approach ensures robust state estimation under aggressive UAV +maneuvers without relying on external sensors or control inputs. Notably, our +method also demonstrates strong generalizability to unseen data not included in +the training set, underscoring its potential for real-world UAV applications. +
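The contrast between the two preprocessing choices, rotating raw IMU readings into the global frame versus keeping them in the body frame and passing attitude as a separate input, can be made concrete with a small sketch. The arrays and the way attitude is appended below are schematic assumptions, not the AirIO network interface.

```python
import numpy as np
from scipy.spatial.transform import Rotation as R

# hypothetical body-frame accelerometer readings and orientation estimates over time
acc_body = np.array([[0.1, 0.0, 9.9], [0.2, -0.1, 9.7]])           # (T, 3)
orientations = R.from_euler("xyz", [[0, 0, 30], [0, 0, 35]], degrees=True)

# conventional preprocessing: rotate readings into the global/world frame
acc_world = orientations.apply(acc_body)

# body-frame alternative: keep raw readings and feed attitude explicitly,
# instead of baking it into the measurements
attitude_features = orientations.as_quat()                          # (T, 4)
network_input = np.concatenate([acc_body, attitude_features], axis=1)
print(acc_world.shape, network_input.shape)
```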
+
+
+
+
+ + ☆ Your Learned Constraint is Secretly a Backward Reachable Tube + + +
+ Inverse Constraint Learning (ICL) is the problem of inferring constraints +from safe (i.e., constraint-satisfying) demonstrations. The hope is that these +inferred constraints can then be used downstream to search for safe policies +for new tasks and, potentially, under different dynamics. Our paper explores +the question of what mathematical entity ICL recovers. Somewhat surprisingly, +we show that both in theory and in practice, ICL recovers the set of states +where failure is inevitable, rather than the set of states where failure has +already happened. In the language of safe control, this means we recover a +backwards reachable tube (BRT) rather than a failure set. In contrast to the +failure set, the BRT depends on the dynamics of the data collection system. We +discuss the implications of the dynamics-conditionedness of the recovered +constraint on both the sample-efficiency of policy search and the +transferability of learned constraints. + +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Diffusion-Based Planning for Autonomous Driving with Flexible Guidance + + +
+ Achieving human-like driving behaviors in complex open-world environments is +a critical challenge in autonomous driving. Contemporary learning-based +planning approaches such as imitation learning methods often struggle to +balance competing objectives and lack safety assurance, due to limited +adaptability and inadequacy in learning the complex multi-modal behaviors commonly +exhibited in human planning, not to mention their strong reliance on +fallback strategies with predefined rules. We propose a novel transformer-based +Diffusion Planner for closed-loop planning, which can effectively model +multi-modal driving behavior and ensure trajectory quality without any +rule-based refinement. Our model supports joint modeling of both prediction and +planning tasks under the same architecture, enabling cooperative behaviors +between vehicles. Moreover, by learning the gradient of the trajectory score +function and employing a flexible classifier guidance mechanism, Diffusion +Planner effectively achieves safe and adaptable planning behaviors. Evaluations +on the large-scale real-world autonomous planning benchmark nuPlan and our +newly collected 200-hour delivery-vehicle driving dataset demonstrate that +Diffusion Planner achieves state-of-the-art closed-loop performance with robust +transferability in diverse driving styles. +
+
+
+
+
+ + ☆ Unveiling the Potential of iMarkers: Invisible Fiducial Markers for + Advanced Robotics + + +
+ Fiducial markers are widely used in various robotics tasks, facilitating +enhanced navigation, object recognition, and scene understanding. Despite their +advantages for robots and Augmented Reality (AR) applications, they often +disrupt the visual aesthetics of environments because they are visible to +humans, making them unsuitable for non-intrusive use cases. To address this +gap, this paper presents "iMarkers"-innovative, unobtrusive fiducial markers +detectable exclusively by robots equipped with specialized sensors. These +markers offer high flexibility in production, allowing customization of their +visibility range and encoding algorithms to suit various demands. The paper +also introduces the hardware designs and software algorithms developed for +detecting iMarkers, highlighting their adaptability and robustness in the +detection and recognition stages. Various evaluations have demonstrated the +effectiveness of iMarkers compared to conventional (printed) and blended +fiducial markers and confirmed their applicability in diverse robotics +scenarios. + +
+
+ comment: 12 pages, 10 figures, 2 tables +
+
+
+
+
+ + ☆ FAVbot: An Autonomous Target Tracking Micro-Robot with Frequency + Actuation Control + + +
+ Robotic autonomy at centimeter scale requires compact and +miniaturization-friendly actuation integrated with sensing and neural network +processing assembly within a tiny form factor. Applications of such systems +have witnessed significant advancements in recent years in fields such as +healthcare, manufacturing, and post-disaster rescue. The system design at this +scale puts stringent constraints on power consumption for both the sensory +front-end and actuation back-end and the weight of the electronic assembly for +robust operation. In this paper, we introduce FAVbot, the first autonomous +mobile micro-robotic system integrated with a novel actuation mechanism and +convolutional neural network (CNN) based computer vision - all integrated +within a compact 3-cm form factor. The novel actuation mechanism utilizes +mechanical resonance phenomenon to achieve frequency-controlled steering with a +single piezoelectric actuator. Experimental results demonstrate the +effectiveness of FAVbot's frequency-controlled actuation, which offers a +diverse selection of resonance modes with different motion characteristics. The +actuation system is complemented with the vision front-end where a camera along +with a microcontroller supports object detection for closed-loop control and +autonomous target tracking. This enables adaptive navigation in dynamic +environments. This work contributes to the evolving landscape of neural +network-enabled micro-robotic systems showing the smallest autonomous robot +built using controllable multi-directional single-actuator mechanism. + +
+
+ comment: This paper is under consideration for journal publication. Authors + reserve the right to transfer copyright without notice +
+
+
+
+
+ + ☆ Bridging the Sim2Real Gap: Vision Encoder Pre-Training for Visuomotor + Policy Transfer + + +
+ Simulation offers a scalable and efficient alternative to real-world data +collection for learning visuomotor robotic policies. However, the +simulation-to-reality, or "Sim2Real" distribution shift -- introduced by +employing simulation-trained policies in real-world environments -- frequently +prevents successful policy transfer. This study explores the potential of using +large-scale pre-training of vision encoders to address the Sim2Real gap. We +examine a diverse collection of encoders, evaluating their ability to (1) +extract features necessary for robot control while (2) remaining invariant to +task-irrelevant environmental variations. We quantitatively measure the +encoder's feature extraction capabilities through linear probing and its domain +invariance by computing distances between simulation and real-world embedding +centroids. Additional qualitative insights are provided through t-SNE plots and +GradCAM saliency maps. Findings suggest that encoders pre-trained on +manipulation-specific datasets generally outperform those trained on generic +datasets in bridging the Sim2Real gap. +https://github.com/yyardi/Bridging-the-Sim2Real-Gap + +
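The domain-invariance measure mentioned here, the distance between simulation and real-world embedding centroids, is simple to reproduce in spirit. The feature arrays below are placeholders standing in for encoder outputs on matched sim and real image batches.

```python
import numpy as np

def centroid_gap(sim_feats, real_feats):
    """Euclidean distance between the mean embeddings of simulated and real
    observations; smaller values suggest a more domain-invariant encoder."""
    return float(np.linalg.norm(sim_feats.mean(axis=0) - real_feats.mean(axis=0)))

# hypothetical encoder outputs for matched sim and real batches
sim_feats = np.random.randn(512, 768)
real_feats = np.random.randn(512, 768) + 0.1
print(centroid_gap(sim_feats, real_feats))
```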
+
+ comment: 9 pages, 10 figures, view GitHub for all appendix figures from the + study +
+
+
+
+
+ + ♻ ☆ Performance Assessment of Lidar Odometry Frameworks: A Case Study at the + Australian Botanic Garden Mount Annan + + +
+ Autonomous vehicles are being tested in diverse environments worldwide. +However, a notable gap exists in evaluating datasets representing natural, +unstructured environments such as forests or gardens. To address this, we +present a study on localisation at the Australian Botanic Garden Mount Annan. +This area encompasses open grassy areas, paved pathways, and densely vegetated +sections with trees and other objects. The dataset was recorded using a +128-beam LiDAR sensor and GPS and IMU readings to track the ego-vehicle. This +paper evaluates the performance of two state-of-the-art LiDAR-inertial odometry +frameworks, COIN-LIO and LIO-SAM, on this dataset. We analyse trajectory +estimates in both horizontal and vertical dimensions and assess relative +translation and yaw errors over varying distances. Our findings reveal that +while both frameworks perform adequately in the vertical plane, COIN-LIO +demonstrates superior accuracy in the horizontal plane, particularly over +extended trajectories. In contrast, LIO-SAM shows increased drift and yaw +errors over longer distances. +
+
+ comment: The 2024 Australasian Conference on Robotics and Automation (ACRA + 2024) +
+
+
+
+
+ + ♻ ☆ Multimodal and Force-Matched Imitation Learning with a See-Through + Visuotactile Sensor + + +
+ Contact-rich tasks continue to present many challenges for robotic +manipulation. In this work, we leverage a multimodal visuotactile sensor within +the framework of imitation learning (IL) to perform contact-rich tasks that +involve relative motion (e.g., slipping and sliding) between the end-effector +and the manipulated object. We introduce two algorithmic contributions, tactile +force matching and learned mode switching, as complementary methods for +improving IL. Tactile force matching enhances kinesthetic teaching by reading +approximate forces during the demonstration and generating an adapted robot +trajectory that recreates the recorded forces. Learned mode switching uses IL +to couple visual and tactile sensor modes with the learned motion policy, +simplifying the transition from reaching to contacting. We perform robotic +manipulation experiments on four door-opening tasks with a variety of +observation and algorithm configurations to study the utility of multimodal +visuotactile sensing and our proposed improvements. Our results show that the +inclusion of force matching raises average policy success rates by 62.5%, +visuotactile mode switching by 30.3%, and visuotactile data as a policy input +by 42.5%, emphasizing the value of see-through tactile sensing for IL, both for +data collection to allow force matching, and for policy execution to enable +accurate task feedback. Project site: https://papers.starslab.ca/sts-il/ +
+
+ comment: 14 pages, 22 figures +
+
+
+
+
+ + ♻ ☆ Sensor-Based Distributionally Robust Control for Safe Robot Navigation + in Dynamic Environments + + +
+ We introduce a novel method for mobile robot navigation in dynamic, unknown +environments, leveraging onboard sensing and distributionally robust +optimization to impose probabilistic safety constraints. Our method introduces +a distributionally robust control barrier function (DR-CBF) that directly +integrates noisy sensor measurements and state estimates to define safety +constraints. This approach is applicable to a wide range of control-affine +dynamics, generalizable to robots with complex geometries, and capable of +operating at real-time control frequencies. Coupled with a control Lyapunov +function (CLF) for path following, the proposed CLF-DR-CBF control synthesis +method achieves safe, robust, and efficient navigation in challenging +environments. We demonstrate the effectiveness and robustness of our approach +for safe autonomous navigation under uncertainty in simulations and real-world +experiments with differential-drive robots. + +
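At its core, a control barrier function filter solves a small QP that minimally modifies a nominal input while keeping the state in the safe set: minimize ||u - u_nom||^2 subject to grad h(x) . (f(x) + g(x) u) >= -alpha h(x). The CVXPY sketch below is the plain, deterministic CBF-QP for a single-integrator robot avoiding a disc obstacle; the distributionally robust, sensor-driven constraint in the paper tightens this condition and is not shown.

```python
import numpy as np
import cvxpy as cp

# single-integrator robot (x_dot = u) avoiding a disc obstacle of radius r at the origin
x = np.array([1.2, 0.8])          # current position
u_nom = np.array([-1.0, -0.6])    # nominal (e.g., CLF path-following) input
r, alpha = 0.5, 1.0

h = x @ x - r**2                  # h(x) >= 0 defines the safe set
grad_h = 2 * x                    # gradient of h; for x_dot = u, h_dot = grad_h . u

u = cp.Variable(2)
constraints = [grad_h @ u >= -alpha * h]          # CBF condition: h_dot >= -alpha * h
prob = cp.Problem(cp.Minimize(cp.sum_squares(u - u_nom)), constraints)
prob.solve()
print(u.value)                    # minimally modified safe control input
```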
+
+ comment: Project page: https://existentialrobotics.org/DRO_Safe_Navigation +
+
+
+
+
+ + ♻ ☆ Distributed Model Predictive Covariance Steering + + +
+ This paper proposes Distributed Model Predictive Covariance Steering (DiMPCS) +for multi-agent control under stochastic uncertainty. The scope of our approach +is to blend covariance steering theory, distributed optimization and model +predictive control (MPC) into a single framework that is safe, scalable and +decentralized. Initially, we pose a problem formulation that uses the +Wasserstein distance to steer the state distributions of a multi-agent system +to desired targets, and probabilistic constraints to ensure safety. We then +transform this problem into a finite-dimensional optimization one by utilizing +a disturbance feedback policy parametrization for covariance steering and a +tractable approximation of the safety constraints. To solve the latter problem, +we derive a decentralized consensus-based algorithm using the Alternating +Direction Method of Multipliers. This method is then extended to a receding +horizon form, which yields the proposed DiMPCS algorithm. Simulation +experiments on a variety of multi-robot tasks with up to hundreds of robots +demonstrate the effectiveness of DiMPCS. The superior scalability and +performance of the proposed method is also highlighted through a comparison +against related stochastic MPC approaches. Finally, hardware results on a +multi-robot platform also verify the applicability of DiMPCS on real systems. A +video with all results is available in https://youtu.be/tzWqOzuj2kQ. + +
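The Wasserstein distance used to steer state distributions has a closed form between Gaussians: W2^2(N(mu1, S1), N(mu2, S2)) = ||mu1 - mu2||^2 + Tr(S1 + S2 - 2 (S2^{1/2} S1 S2^{1/2})^{1/2}). A small numerical sketch of that formula (not the DiMPCS code) with made-up state and target distributions:

```python
import numpy as np
from scipy.linalg import sqrtm

def gaussian_w2_squared(mu1, cov1, mu2, cov2):
    """Squared 2-Wasserstein distance between two Gaussian distributions."""
    sqrt_cov2 = sqrtm(cov2)
    cross = np.real(sqrtm(sqrt_cov2 @ cov1 @ sqrt_cov2))  # drop numerical imaginary parts
    return float(np.sum((mu1 - mu2) ** 2) + np.trace(cov1 + cov2 - 2 * cross))

mu_target, cov_target = np.zeros(2), 0.05 * np.eye(2)
mu_state, cov_state = np.array([0.4, -0.2]), 0.2 * np.eye(2)
print(gaussian_w2_squared(mu_state, cov_state, mu_target, cov_target))
```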
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 16 + +
+
+
+ + ☆ MimicGait: A Model Agnostic approach for Occluded Gait Recognition using + Correlational Knowledge Distillation + + +
+ Gait recognition is an important biometric technique over large distances. +State-of-the-art gait recognition systems perform very well in controlled +environments at close range. Recently, there has been an increased interest in +gait recognition in the wild prompted by the collection of outdoor, more +challenging datasets containing variations in terms of illumination, pitch +angles, and distances. An important problem in these environments is that of +occlusion, where the subject is partially blocked from camera view. While +important, this problem has received little attention. Thus, we propose +MimicGait, a model-agnostic approach for gait recognition in the presence of +occlusions. We train the network using a multi-instance correlational +distillation loss to capture both inter-sequence and intra-sequence +correlations in the occluded gait patterns of a subject, utilizing an auxiliary +Visibility Estimation Network to guide the training of the proposed mimic +network. We demonstrate the effectiveness of our approach on challenging +real-world datasets like GREW, Gait3D and BRIAR. We release the code in +https://github.com/Ayush-00/mimicgait. + +
+
+ comment: Accepted to WACV 2025 as Poster +
+
+
+
+
+ + ☆ Marker Track: Accurate Fiducial Marker Tracking for Evaluation of + Residual Motions During Breath-Hold Radiotherapy + + +
+ Fiducial marker positions in projection image of cone-beam computed +tomography (CBCT) scans have been studied to evaluate daily residual motion +during breath-hold radiation therapy. Fiducial marker migration posed +challenges in accurately locating markers, prompting the development of a novel +algorithm that reconstructs volumetric probability maps of marker locations +from filtered gradient maps of projections. This guides the development of a +Python-based algorithm to detect fiducial markers in projection images using +Meta AI's Segment Anything Model 2 (SAM 2). Retrospective data from a +pancreatic cancer patient with two fiducial markers were analyzed. The +three-dimensional (3D) marker positions from simulation computed tomography +(CT) were compared to those reconstructed from CBCT images, revealing a +decrease in relative distances between markers over time. Fiducial markers were +successfully detected in 2777 out of 2786 projection frames. The average +standard deviation of superior-inferior (SI) marker positions was 0.56 mm per +breath-hold, with differences in average SI positions between two breath-holds +in the same scan reaching up to 5.2 mm, and a gap of up to 7.3 mm between the +end of the first and beginning of the second breath-hold. 3D marker positions +were calculated using projection positions and confirmed marker migration. This +method effectively calculates marker probability volume and enables accurate +fiducial marker tracking during treatment without requiring any specialized +equipment, additional radiation doses, or manual initialization and labeling. +It has significant potential for automatically assessing daily residual motion +to adjust planning margins, functioning as an adaptive radiation therapy tool. + +
+
+ comment: 14 pages, 9 figures, Regeneron STS 2025 project. Project page: + https://sites.google.com/view/markertrack?usp=sharing +
+
+
+
+
+ + ☆ AirIO: Learning Inertial Odometry with Enhanced IMU Feature + Observability + + +
+ Inertial odometry (IO) using only Inertial Measurement Units (IMUs) offers a +lightweight and cost-effective solution for Unmanned Aerial Vehicle (UAV) +applications, yet existing learning-based IO models often fail to generalize to +UAVs due to the highly dynamic and non-linear flight patterns that differ from +pedestrian motion. In this work, we identify that the conventional practice of +transforming raw IMU data to global coordinates undermines the observability of +critical kinematic information in UAVs. By preserving the body-frame +representation, our method achieves substantial performance improvements, with +a 66.7% average increase in accuracy across three datasets. Furthermore, +explicitly encoding attitude information into the motion network results in an +additional 23.8% improvement over prior results. Combined with a data-driven +IMU correction model (AirIMU) and an uncertainty-aware Extended Kalman Filter +(EKF), our approach ensures robust state estimation under aggressive UAV +maneuvers without relying on external sensors or control inputs. Notably, our +method also demonstrates strong generalizability to unseen data not included in +the training set, underscoring its potential for real-world UAV applications. +
+
+
+
+
+ + ☆ Classifying Deepfakes Using Swin Transformers + + +
+ The proliferation of deepfake technology poses significant challenges to the +authenticity and trustworthiness of digital media, necessitating the +development of robust detection methods. This study explores the application of +Swin Transformers, a state-of-the-art architecture leveraging shifted windows +for self-attention, in detecting and classifying deepfake images. Using the +Real and Fake Face Detection dataset by Yonsei University's Computational +Intelligence Photography Lab, we evaluate the Swin Transformer and hybrid +models such as Swin-ResNet and Swin-KNN, focusing on their ability to identify +subtle manipulation artifacts. Our results demonstrate that the Swin +Transformer outperforms conventional CNN-based architectures, including VGG16, +ResNet18, and AlexNet, achieving a test accuracy of 71.29\%. Additionally, we +present insights into hybrid model design, highlighting the complementary +strengths of transformer and CNN-based approaches in deepfake detection. This +study underscores the potential of transformer-based architectures for +improving accuracy and generalizability in image-based manipulation detection, +paving the way for more effective countermeasures against deepfake threats. + +
+
+ comment: 3 pages +
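+ As a rough illustration of the kind of classifier described above, the snippet
+ below adapts a Swin-T backbone from torchvision for binary real/fake
+ prediction. The optimizer, learning rate, and input size are placeholder
+ choices, not the paper's training setup.
+
+import torch
+import torch.nn as nn
+from torchvision import models
+
+# Swin-T backbone with its ImageNet head swapped for a 2-class (real/fake) head.
+model = models.swin_t(weights=models.Swin_T_Weights.IMAGENET1K_V1)
+model.head = nn.Linear(model.head.in_features, 2)
+
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+
+def train_step(images, labels):
+    """One optimization step on a batch of (B, 3, 224, 224) face crops."""
+    model.train()
+    optimizer.zero_grad()
+    loss = criterion(model(images), labels)
+    loss.backward()
+    optimizer.step()
+    return loss.item()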
+
+
+
+
+ + ☆ A Privacy Enhancing Technique to Evade Detection by Street Video Cameras + Without Using Adversarial Accessories + + +
+ In this paper, we propose a privacy-enhancing technique leveraging an +inherent property of automatic pedestrian detection algorithms, namely, that +the training of deep neural network (DNN) based methods is generally performed +using curated datasets and laboratory settings, while the operational areas of +these methods are dynamic real-world environments. In particular, we leverage a +novel side effect of this gap between the laboratory and the real world: +location-based weakness in pedestrian detection. We demonstrate that the +position (distance, angle, height) of a person, and ambient light level, +directly impact the confidence of a pedestrian detector when detecting the +person. We then demonstrate that this phenomenon is present in pedestrian +detectors observing a stationary scene of pedestrian traffic, with blind spot +areas of weak detection of pedestrians with low confidence. We show how +privacy-concerned pedestrians can leverage these blind spots to evade detection +by constructing a minimum confidence path between two points in a scene, +reducing the maximum confidence and average confidence of the path by up to +0.09 and 0.13, respectively, over direct and random paths through the scene. To +counter this phenomenon, and force the use of more costly and sophisticated +methods to leverage this vulnerability, we propose a novel countermeasure to +improve the confidence of pedestrian detectors in blind spots, raising the +max/average confidence of paths generated by our technique by 0.09 and 0.05, +respectively. In addition, we demonstrate that our countermeasure improves a +Faster R-CNN-based pedestrian detector's TPR and average true positive +confidence by 0.03 and 0.15, respectively. + +
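+ The minimum-confidence path idea above can be sketched as a shortest-path
+ problem over a grid of detector confidences, where entering a cell costs the
+ confidence the detector assigns there. The Dijkstra sketch below is a generic
+ illustration under that assumption, not the authors' path planner.
+
+import heapq
+import numpy as np
+
+def min_confidence_path(conf, start, goal):
+    """Dijkstra over a 2D grid where entering a cell costs its detector confidence.
+    conf: (H, W) array of confidences in [0, 1]; start, goal: (row, col) tuples."""
+    H, W = conf.shape
+    dist = np.full((H, W), np.inf)
+    prev = {}
+    dist[start] = conf[start]
+    pq = [(conf[start], start)]
+    while pq:
+        d, (r, c) = heapq.heappop(pq)
+        if (r, c) == goal:
+            break
+        if d > dist[r, c]:
+            continue
+        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
+            nr, nc = r + dr, c + dc
+            if 0 <= nr < H and 0 <= nc < W and d + conf[nr, nc] < dist[nr, nc]:
+                dist[nr, nc] = d + conf[nr, nc]
+                prev[(nr, nc)] = (r, c)
+                heapq.heappush(pq, (dist[nr, nc], (nr, nc)))
+    # Walk back from the goal to recover the low-confidence route.
+    path, node = [goal], goal
+    while node != start:
+        node = prev[node]
+        path.append(node)
+    return path[::-1]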
+
+
+
+
+ + ☆ Can Pose Transfer Models Generate Realistic Human Motion? + + +
+ Recent pose-transfer methods aim to generate temporally consistent and fully +controllable videos of human action where the motion from a reference video is +reenacted by a new identity. We evaluate three state-of-the-art pose-transfer +methods -- AnimateAnyone, MagicAnimate, and ExAvatar -- by generating videos +with actions and identities outside the training distribution and conducting a +participant study about the quality of these videos. In a controlled +environment of 20 distinct human actions, we find that participants, presented +with the pose-transferred videos, correctly identify the desired action only +42.92% of the time. Moreover, the participants find the actions in the +generated videos consistent with the reference (source) videos only 36.46% of +the time. These results vary by method: participants find the splatting-based +ExAvatar more consistent and photorealistic than the diffusion-based +AnimateAnyone and MagicAnimate. + +
+
+ comment: Data and code available at + https://github.com/matyasbohacek/pose-transfer-human-motion +
+
+
+
+
+ + ☆ Bringing Characters to New Stories: Training-Free Theme-Specific Image + Generation via Dynamic Visual Prompting + + +
+ The stories and characters that captivate us as we grow up shape unique
+fantasy worlds, with images serving as the primary medium for visually
+experiencing these realms. Personalizing generative models through fine-tuning
+with theme-specific data has become a prevalent approach in text-to-image
+generation. However, unlike object customization, which focuses on learning
+specific objects, theme-specific generation encompasses diverse elements such
+as characters, scenes, and objects. Such diversity also introduces a key
+challenge: how to adaptively generate multi-character, multi-concept, and
+continuous theme-specific images (TSI). Moreover, fine-tuning approaches often
+come with significant computational overhead, time costs, and risks of
+overfitting. This paper explores a fundamental question: Can image generation
+models directly leverage images as contextual input, similarly to how large
+language models use text as context? To address this, we present T-Prompter, a
+novel training-free method for TSI generation. T-Prompter introduces visual
+prompting, a mechanism that integrates reference images into generative models,
+allowing users to seamlessly specify the target theme without requiring
+additional training. To further enhance this process, we propose a Dynamic
+Visual Prompting (DVP) mechanism, which iteratively optimizes visual prompts to
+improve the accuracy and quality of generated images. Our approach enables
+diverse applications, including consistent story generation, character design,
+realistic character generation, and style-guided image generation. Comparative
+evaluations against state-of-the-art personalization methods demonstrate that
+T-Prompter achieves significantly better results and excels in maintaining
+character identity, style consistency, and text alignment, offering a robust
+and flexible solution for theme-specific image generation.
+
+
+
+
+
+ + ☆ GaussianToken: An Effective Image Tokenizer with 2D Gaussian Splatting + + +
+ Effective image tokenization is crucial for both multi-modal understanding
+and generation tasks due to the necessity of the alignment with discrete text
+data. To this end, existing approaches utilize vector quantization (VQ) to
+project pixels onto a discrete codebook and reconstruct images from the
+discrete representation. However, compared with the continuous latent space,
+the limited discrete codebook space significantly restricts the
+representational ability of these image tokenizers. In this paper, we propose
+GaussianToken: An Effective Image Tokenizer with 2D Gaussian Splatting as a
+solution. We first represent the encoded samples as multiple flexible featured
+2D Gaussians characterized by positions, rotation angles, scaling factors, and
+feature coefficients. We adopt the standard quantization for the Gaussian
+features and then concatenate the quantization results with the other intrinsic
+Gaussian parameters before the corresponding splatting operation and the
+subsequent decoding module. In general, GaussianToken integrates the local
+influence of the 2D Gaussian distribution into the discrete space and thus
+enhances the representation capability of the image tokenizer. Competitive
+reconstruction performance on CIFAR, Mini-ImageNet, and ImageNet-1K
+demonstrates the effectiveness of our framework. Our code is available at:
+https://github.com/ChrisDong-THU/GaussianToken.
+
+
+
+
+
+ + ☆ IPVTON: Image-based 3D Virtual Try-on with Image Prompt Adapter + + +
+ Given a pair of images depicting a person and a garment separately, +image-based 3D virtual try-on methods aim to reconstruct a 3D human model that +realistically portrays the person wearing the desired garment. In this paper, +we present IPVTON, a novel image-based 3D virtual try-on framework. IPVTON +employs score distillation sampling with image prompts to optimize a hybrid 3D +human representation, integrating target garment features into diffusion priors +through an image prompt adapter. To avoid interference with non-target areas, +we leverage mask-guided image prompt embeddings to focus the image features on +the try-on regions. Moreover, we impose geometric constraints on the 3D model +with a pseudo silhouette generated by ControlNet, ensuring that the clothed 3D +human model retains the shape of the source identity while accurately wearing +the target garments. Extensive qualitative and quantitative experiments +demonstrate that IPVTON outperforms previous methods in image-based 3D virtual +try-on tasks, excelling in both geometry and texture. + +
+
+
+
+
+ + ☆ Radiologist-in-the-Loop Self-Training for Generalizable CT Metal + Artifact Reduction + + +
+ Metal artifacts in computed tomography (CT) images can significantly degrade +image quality and impede accurate diagnosis. Supervised metal artifact +reduction (MAR) methods, trained using simulated datasets, often struggle to +perform well on real clinical CT images due to a substantial domain gap. +Although state-of-the-art semi-supervised methods use pseudo ground-truths +generated by a prior network to mitigate this issue, their reliance on a fixed +prior limits both the quality and quantity of these pseudo ground-truths, +introducing confirmation bias and reducing clinical applicability. To address +these limitations, we propose a novel Radiologist-In-the-loop SElf-training +framework for MAR, termed RISE-MAR, which can integrate radiologists' feedback +into the semi-supervised learning process, progressively improving the quality +and quantity of pseudo ground-truths for enhanced generalization on real +clinical CT images. For quality assurance, we introduce a clinical quality +assessor model that emulates radiologist evaluations, effectively selecting +high-quality pseudo ground-truths for semi-supervised training. For quantity +assurance, our self-training framework iteratively generates additional +high-quality pseudo ground-truths, expanding the clinical dataset and further +improving model generalization. Extensive experimental results on multiple +clinical datasets demonstrate the superior generalization performance of our +RISE-MAR over state-of-the-art methods, advancing the development of MAR models +for practical application. Code is available at +https://github.com/Masaaki-75/rise-mar. + +
+
+ comment: IEEE TMI 2025 +
+
+
+
+
+ + ☆ Advancing TDFN: Precise Fixation Point Generation Using Reconstruction + Differences + + +
+ Wang and Wang (2025) proposed the Task-Driven Fixation Network (TDFN) based +on the fixation mechanism, which leverages low-resolution information along +with high-resolution details near fixation points to accomplish specific visual +tasks. The model employs reinforcement learning to generate fixation points. +However, training reinforcement learning models is challenging, particularly +when aiming to generate pixel-level accurate fixation points on high-resolution +images. This paper introduces an improved fixation point generation method by +leveraging the difference between the reconstructed image and the input image +to train the fixation point generator. This approach directs fixation points to +areas with significant differences between the reconstructed and input images. +Experimental results demonstrate that this method achieves highly accurate +fixation points, significantly enhances the network's classification accuracy, +and reduces the average number of required fixations to achieve a predefined +accuracy level. + +
+
+ comment: 9 pages, 5 figures, 2 tables +
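+ A minimal sketch of the reconstruction-difference idea described above:
+ compare the reconstructed image with the input, smooth the error map, and
+ place the next fixation at the largest remaining difference. The smoothing
+ scale and the visited-mask mechanism are illustrative assumptions, not details
+ taken from the paper.
+
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+def next_fixation(image, reconstruction, visited_mask=None, sigma=3.0):
+    """Return the (row, col) with the largest smoothed reconstruction error,
+    ignoring locations already covered by previous fixations."""
+    error = np.abs(image.astype(float) - reconstruction.astype(float))
+    error = gaussian_filter(error, sigma=sigma)
+    if visited_mask is not None:
+        error = np.where(visited_mask, 0.0, error)
+    return np.unravel_index(np.argmax(error), error.shape)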
+
+
+
+
+ + ♻ ☆ Slow Perception: Let's Perceive Geometric Figures Step-by-step + + +
+ Recently, "visual o1" began to enter people's vision, with expectations that +this slow-thinking design can solve visual reasoning tasks, especially +geometric math problems. However, the reality is that current LVLMs (Large +Vision Language Models) can hardly even accurately copy a geometric figure, let +alone truly understand the complex inherent logic and spatial relationships +within geometric shapes. We believe accurate copying (strong perception) is the +first step to visual o1. Accordingly, we introduce the concept of "slow +perception" (SP), which guides the model to gradually perceive basic point-line +combinations, as our humans, reconstruct complex geometric structures +progressively. There are two-fold stages in SP: a) perception decomposition. +Perception is not instantaneous. In this stage, complex geometric figures are +broken down into basic simple units to unify geometry representation. b) +perception flow, which acknowledges that accurately tracing a line is not an +easy task. This stage aims to avoid "long visual jumps" in regressing line +segments by using a proposed "perceptual ruler" to trace each line +stroke-by-stroke. Surprisingly, such a human-like perception manner enjoys an +inference time scaling law -- the slower, the better. Researchers strive to +speed up the model's perception in the past, but we slow it down again, +allowing the model to read the image step-by-step and carefully. + +
+
+
+
+
+ + ♻ ☆ Hierarchical LoG Bayesian Neural Network for Enhanced Aorta Segmentation + + +
+ Accurate segmentation of the aorta and its associated arch branches is +crucial for diagnosing aortic diseases. While deep learning techniques have +significantly improved aorta segmentation, they remain challenging due to the +intricate multiscale structure and the complexity of the surrounding tissues. +This paper presents a novel approach for enhancing aorta segmentation using a +Bayesian neural network-based hierarchical Laplacian of Gaussian (LoG) model. +Our model consists of a 3D U-Net stream and a hierarchical LoG stream: the +former provides an initial aorta segmentation, and the latter enhances blood +vessel detection across varying scales by learning suitable LoG kernels, +enabling self-adaptive handling of different parts of the aorta vessels with +significant scale differences. We employ a Bayesian method to parameterize the +LoG stream and provide confidence intervals for the segmentation results, +ensuring robustness and reliability of the prediction for vascular medical +image analysts. Experimental results show that our model can accurately segment +main and supra-aortic vessels, yielding at least a 3% gain in the Dice +coefficient over state-of-the-art methods across multiple volumes drawn from +two aorta datasets, and can provide reliable confidence intervals for different +parts of the aorta. The code is available at https://github.com/adlsn/LoGBNet. + +
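+ The multiscale Laplacian-of-Gaussian response that the hierarchical LoG stream
+ adapts can be illustrated with fixed kernels via scipy; the scales below are
+ arbitrary placeholders, whereas the paper learns its LoG kernels inside a
+ Bayesian network and attaches confidence intervals to the result.
+
+import numpy as np
+from scipy.ndimage import gaussian_laplace
+
+def multiscale_log(volume, sigmas=(1.0, 2.0, 4.0)):
+    """Stack scale-normalized LoG responses of a 3D volume; vessels of different
+    calibers respond most strongly at different sigmas."""
+    responses = [sigma ** 2 * gaussian_laplace(volume.astype(float), sigma=sigma)
+                 for sigma in sigmas]
+    return np.stack(responses, axis=0)  # (num_scales, D, H, W)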
+
+
+
+
+ + ♻ ☆ COLA: COarse-LAbel multi-source LiDAR semantic segmentation for + autonomous driving + + +
+ LiDAR semantic segmentation for autonomous driving has been a growing field +of interest in recent years. Datasets and methods have appeared and expanded +very quickly, but methods have not been updated to exploit this new data +availability and rely on the same classical datasets. Different ways of +performing LIDAR semantic segmentation training and inference can be divided +into several subfields, which include the following: domain generalization, +source-to-source segmentation, and pre-training. In this work, we aim to +improve results in all of these subfields with the novel approach of +multi-source training. Multi-source training relies on the availability of +various datasets at training time. To overcome the common obstacles in +multi-source training, we introduce the coarse labels and call the newly +created multi-source dataset COLA. We propose three applications of this new +dataset that display systematic improvement over single-source strategies: +COLA-DG for domain generalization (+10%), COLA-S2S for source-to-source +segmentation (+5.3%), and COLA-PT for pre-training (+12%). We demonstrate that +multi-source approaches bring systematic improvement over single-source +approaches. + +
+
+
+
+
+ + ♻ ☆ Reflecting Reality: Enabling Diffusion Models to Produce Faithful Mirror + Reflections 3DV 2025 + + +
+ We tackle the problem of generating highly realistic and plausible mirror +reflections using diffusion-based generative models. We formulate this problem +as an image inpainting task, allowing for more user control over the placement +of mirrors during the generation process. To enable this, we create SynMirror, +a large-scale dataset of diverse synthetic scenes with objects placed in front +of mirrors. SynMirror contains around 198k samples rendered from 66k unique 3D +objects, along with their associated depth maps, normal maps and instance-wise +segmentation masks, to capture relevant geometric properties of the scene. +Using this dataset, we propose a novel depth-conditioned inpainting method +called MirrorFusion, which generates high-quality, realistic, shape and +appearance-aware reflections of real-world objects. MirrorFusion outperforms +state-of-the-art methods on SynMirror, as demonstrated by extensive +quantitative and qualitative analysis. To the best of our knowledge, we are the +first to successfully tackle the challenging problem of generating controlled +and faithful mirror reflections of an object in a scene using diffusion-based +models. SynMirror and MirrorFusion open up new avenues for image editing and +augmented reality applications for practitioners and researchers alike. The +project page is available at: +https://val.cds.iisc.ac.in/reflecting-reality.github.io/. + +
+
+ comment: Accepted to 3DV 2025. First two authors contributed equally. Project + Page: https://val.cds.iisc.ac.in/reflecting-reality.github.io/ +
+
+
+
+
+ + ♻ ☆ Adversarially Robust Out-of-Distribution Detection Using + Lyapunov-Stabilized Embeddings ICLR + + +
+ Despite significant advancements in out-of-distribution (OOD) detection,
+existing methods still struggle to maintain robustness against adversarial
+attacks, compromising their reliability in critical real-world applications.
+Previous studies have attempted to address this challenge by exposing detectors
+to auxiliary OOD datasets alongside adversarial training. However, the
+increased data complexity inherent in adversarial training, and the myriad of
+ways that OOD samples can arise during testing, often prevent these approaches
+from establishing robust decision boundaries. To address these limitations, we
+propose AROS, a novel approach leveraging neural ordinary differential
+equations (NODEs) with the Lyapunov stability theorem to obtain robust
+embeddings for OOD detection. By incorporating a tailored loss function, we
+apply Lyapunov stability theory to ensure that both in-distribution (ID) and
+OOD data converge to stable equilibrium points within the dynamical system.
+This approach encourages any perturbed input to return to its stable
+equilibrium, thereby enhancing the model's robustness against adversarial
+perturbations. To avoid using additional data, we generate fake OOD embeddings
+by sampling from low-likelihood regions of the ID data feature space,
+approximating the boundaries where OOD data are likely to reside. To further
+enhance robustness, we propose the use of an orthogonal binary layer following
+the stable feature space, which maximizes the separation between the
+equilibrium points of ID and OOD samples. We validate our method through
+extensive experiments across several benchmarks, demonstrating superior
+performance, particularly under adversarial attacks. Notably, our approach
+improves robust detection performance from 37.8% to 80.1% on CIFAR-10 vs.
+CIFAR-100 and from 29.0% to 67.0% on CIFAR-100 vs. CIFAR-10.
+
+
+ comment: Accepted at the International Conference on Learning Representations + (ICLR) 2025. Code and pre-trained models are available at + https://github.com/AdaptiveMotorControlLab/AROS +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 14 + +
+
+
+ + ☆ Safe and Agile Transportation of Cable-Suspended Payload via Multiple + Aerial Robots + + +
+ Transporting a heavy payload using multiple aerial robots (MARs) is an
+efficient way to extend the load capacity of a single aerial robot. However,
+existing schemes for the multiple aerial robots transportation system (MARTS)
+still lack the capability to generate a collision-free and dynamically feasible
+trajectory in real-time and further track an agile trajectory, especially when
+there are no sensors available to measure the states of the payload and cable.
+Therefore, they are limited to low-agility transportation in simple
+environments. To bridge the gap, we propose complete planning and control
+schemes for the MARTS, achieving safe and agile aerial transportation (SAAT) of
+a cable-suspended payload in complex environments. Flatness maps for the aerial
+robot considering the complete kinematic constraints and the dynamical coupling
+between each aerial robot and the payload are derived. To improve
+responsiveness in generating safe, dynamically feasible, and agile trajectories
+in complex environments, a real-time spatio-temporal trajectory planning scheme
+is proposed for the MARTS. Moreover, we break away from the reliance on state
+measurements for both the payload and cable, as well as closed-loop control for
+the payload, and propose a fully distributed control scheme to track the agile
+trajectory that is robust against imprecise payload mass and non-point mass
+payloads. The proposed schemes are extensively validated through benchmark
+comparisons, ablation studies, and simulations. Finally, extensive real-world
+experiments are conducted on a MARTS composed of three aerial robots with
+onboard computers and sensors. The results validate the efficiency and
+robustness of our proposed schemes for SAAT in complex environments.
+
+
+ comment: 20 pages, 14 figures, submitted to IEEE Transactions on Robotics +
+
+
+
+
+ + ☆ Zero-shot Robotic Manipulation with Language-guided Instruction and + Formal Task Planning + + +
+ Robotic manipulation is often challenging due to long-horizon tasks and
+complex object relationships. A common solution is to develop a task and
+motion planning framework that integrates high-level task planning and
+low-level motion planning. Recently, inspired by the powerful reasoning ability
+of Large Language Models (LLMs), LLM-based planning approaches have achieved
+remarkable progress. However, these methods still heavily rely on
+expert-specific knowledge, often generating invalid plans for unseen and
+unfamiliar tasks. To address this issue, we propose an innovative
+language-guided symbolic task planning (LM-SymOpt) framework with optimization.
+It is the first expert-free planning framework, since we combine the world
+knowledge from LLMs with formal reasoning, resulting in improved generalization
+capability to new tasks. Specifically, unlike most existing work, our LM-SymOpt
+employs LLMs to translate natural language instructions into symbolic
+representations, thereby representing actions as high-level symbols and
+reducing the search space for planning. Next, after evaluating the action
+probability of completing the task using LLMs, a weighted random sampling
+method is introduced to generate candidate plans. Their feasibility is assessed
+through symbolic reasoning and their cost efficiency is then evaluated using
+trajectory optimization to select the optimal plan. Our experimental results
+show that LM-SymOpt outperforms existing LLM-based planning approaches.
+
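+ The candidate-plan generation step described above can be sketched as weighted
+ sampling over LLM-scored symbolic actions, followed by a feasibility check and
+ cost-based selection. The scoring, feasibility, and cost functions below are
+ placeholders standing in for the LLM, the symbolic reasoner, and the trajectory
+ optimizer, respectively.
+
+import random
+
+def sample_candidate_plans(actions, llm_scores, num_plans=10, plan_len=4, seed=0):
+    """Draw candidate plans by sampling symbolic actions in proportion to their
+    LLM-estimated probability of completing the task."""
+    rng = random.Random(seed)
+    return [rng.choices(actions, weights=llm_scores, k=plan_len)
+            for _ in range(num_plans)]
+
+def select_plan(plans, is_feasible, trajectory_cost):
+    """Keep symbolically feasible plans and return the one with the lowest
+    trajectory-optimization cost (None if nothing is feasible)."""
+    feasible = [p for p in plans if is_feasible(p)]
+    return min(feasible, key=trajectory_cost) if feasible else None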
+
+
+
+
+ + ☆ Towards Conscious Service Robots + + +
+ Deep learning's success in perception, natural language processing, etc.
+inspires hopes for advancements in autonomous robotics. However, real-world
+robotics faces challenges like variability, high-dimensional state spaces,
+non-linear dependencies, and partial observability. A key issue is the
+non-stationarity of robots, environments, and tasks, leading to performance
+drops with out-of-distribution data. Unlike current machine learning models,
+humans adapt quickly to changes and new tasks due to a cognitive architecture
+that enables systematic generalization and meta-cognition. The human brain's
+System 1 handles routine tasks unconsciously, while System 2 manages complex
+tasks consciously, facilitating flexible problem-solving and self-monitoring.
+For robots to achieve human-like learning and reasoning, they need to integrate
+causal models, working memory, planning, and metacognitive processing. By
+incorporating insights from human cognition, the next generation of service
+robots will handle novel situations and monitor themselves to avoid risks and
+mitigate errors.
+
+
+ comment: In: Science for a Better Tomorrow: Curious 2024 Insights Actions, + Springer 2025 +
+
+
+
+
+ + ☆ Extracting Forward Invariant Sets from Neural Network-Based Control + Barrier Functions + + +
+ Training Neural Networks (NNs) to serve as Barrier Functions (BFs) is a
+popular way to improve the safety of autonomous dynamical systems. Despite
+significant practical success, these methods are not generally guaranteed to
+produce true BFs in a provable sense, which undermines their intended use as
+safety certificates. In this paper, we consider the problem of formally
+certifying a learned NN as a BF with respect to state avoidance for an
+autonomous system: viz. computing a region of the state space on which the
+candidate NN is provably a BF. In particular, we propose a sound algorithm that
+efficiently produces such a certificate set for a shallow NN. Our algorithm
+combines two novel approaches: it first uses NN reachability tools to identify
+a subset of states for which the output of the NN does not increase along
+system trajectories; then, it uses a novel enumeration algorithm for hyperplane
+arrangements to find the intersection of the NN's zero-sub-level set with the
+first set of states. In this way, our algorithm soundly finds a subset of
+states on which the NN is certified as a BF. We further demonstrate the
+effectiveness of our algorithm at certifying real-world NNs as BFs in two case
+studies. We complement these with scalability experiments that demonstrate the
+efficiency of our algorithm.
+
+
+
+
+
+ + ☆ Impact-resistant, autonomous robots inspired by tensegrity architecture + + +
+ Future robots will navigate perilous, remote environments with resilience and +autonomy. Researchers have proposed building robots with compliant bodies to +enhance robustness, but this approach often sacrifices the autonomous +capabilities expected of rigid robots. Inspired by tensegrity architecture, we +introduce a tensegrity robot -- a hybrid robot made from rigid struts and +elastic tendons -- that demonstrates the advantages of compliance and the +autonomy necessary for task performance. This robot boasts impact resistance +and autonomy in a field environment and additional advances in the state of the +art, including surviving harsh impacts from drops (at least 5.7 m), accurately +reconstructing its shape and orientation using on-board sensors, achieving high +locomotion speeds (18 bar lengths per minute), and climbing the steepest +incline of any tensegrity robot (28 degrees). We characterize the robot's +locomotion on unstructured terrain, showcase its autonomous capabilities in +navigation tasks, and demonstrate its robustness by rolling it off a cliff. + +
+
+
+
+
+ + ☆ Understanding via Gaze: Gaze-based Task Decomposition for Imitation + Learning of Robot Manipulation + + +
+ In imitation learning for robotic manipulation, decomposing object +manipulation tasks into multiple semantic actions is essential. This +decomposition enables the reuse of learned skills in varying contexts and the +combination of acquired skills to perform novel tasks, rather than merely +replicating demonstrated motions. Gaze, an evolutionary tool for understanding +ongoing events, plays a critical role in human object manipulation, where it +strongly correlates with motion planning. In this study, we propose a simple +yet robust task decomposition method based on gaze transitions. We hypothesize +that an imitation agent's gaze control, fixating on specific landmarks and +transitioning between them, naturally segments demonstrated manipulations into +sub-tasks. Notably, our method achieves consistent task decomposition across +all demonstrations, which is desirable in contexts such as machine learning. +Using teleoperation, a common modality in imitation learning for robotic +manipulation, we collected demonstration data for various tasks, applied our +segmentation method, and evaluated the characteristics and consistency of the +resulting sub-tasks. Furthermore, through extensive testing across a wide range +of hyperparameter variations, we demonstrated that the proposed method +possesses the robustness necessary for application to different robotic +systems. + +
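+ A minimal sketch of the gaze-transition segmentation described above: given a
+ per-frame label of which landmark the gaze is fixating, cut the demonstration
+ wherever that label changes. The landmark labels are assumed inputs here; the
+ paper derives them from the imitation agent's measured gaze.
+
+def segment_by_gaze(gaze_landmarks):
+    """Split a demonstration into sub-tasks at gaze-fixation transitions.
+    gaze_landmarks: per-frame landmark ids, e.g. ['cup', 'cup', 'shelf', ...].
+    Returns a list of (start_frame, end_frame, landmark) segments."""
+    if not gaze_landmarks:
+        return []
+    segments, start = [], 0
+    for t in range(1, len(gaze_landmarks)):
+        if gaze_landmarks[t] != gaze_landmarks[t - 1]:
+            segments.append((start, t - 1, gaze_landmarks[t - 1]))
+            start = t
+    segments.append((start, len(gaze_landmarks) - 1, gaze_landmarks[-1]))
+    return segments
+
+# Example: two fixations produce two sub-tasks.
+print(segment_by_gaze(['cup', 'cup', 'cup', 'shelf', 'shelf']))
+# [(0, 2, 'cup'), (3, 4, 'shelf')]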
+
+
+
+
+ + ☆ An Atomic Skill Library Construction Method for Data-Efficient Embodied + Manipulation + + +
+ Embodied manipulation is a fundamental ability in the realm of embodied +artificial intelligence. Although current embodied manipulation models show +certain generalizations in specific settings, they struggle in new environments +and tasks due to the complexity and diversity of real-world scenarios. The +traditional end-to-end data collection and training manner leads to significant +data demands, which we call ``data explosion''. To address the issue, we +introduce a three-wheeled data-driven method to build an atomic skill library. +We divide tasks into subtasks using the Vision-Language Planning (VLP). Then, +atomic skill definitions are formed by abstracting the subtasks. Finally, an +atomic skill library is constructed via data collection and +Vision-Language-Action (VLA) fine-tuning. As the atomic skill library expands +dynamically with the three-wheel update strategy, the range of tasks it can +cover grows naturally. In this way, our method shifts focus from end-to-end +tasks to atomic skills, significantly reducing data costs while maintaining +high performance and enabling efficient adaptation to new tasks. Extensive +experiments in real-world settings demonstrate the effectiveness and efficiency +of our approach. + +
+
+
+
+
+ + ☆ Extensive Exploration in Complex Traffic Scenarios using Hierarchical + Reinforcement Learning + + +
+ Developing an automated driving system capable of navigating complex traffic
+environments remains a formidable challenge. Unlike rule-based or supervised
+learning-based methods, Deep Reinforcement Learning (DRL) based controllers
+eliminate the need for domain-specific knowledge and datasets, thus providing
+adaptability to various scenarios. Nonetheless, a common limitation of existing
+studies on DRL-based controllers is their focus on driving scenarios with
+simple traffic patterns, which hinders their capability to effectively handle
+complex driving environments with delayed, long-term rewards, thus compromising
+the generalizability of their findings. In response to these limitations, our
+research introduces a pioneering hierarchical framework that efficiently
+decomposes intricate decision-making problems into manageable and interpretable
+subtasks. We adopt a two-step training process that trains the high-level
+controller and low-level controller separately. The high-level controller
+exhibits an enhanced exploration potential with long-term delayed rewards, and
+the low-level controller provides longitudinal and lateral control ability
+using short-term instantaneous rewards. Through simulation experiments, we
+demonstrate the superiority of our hierarchical controller in managing complex
+highway driving situations.
+
+
+
+
+
+ + ♻ ☆ Optimal DLT-based Solutions for the Perspective-n-Point + + +
+ We propose a modified normalized direct linear transform (DLT) algorithm for +solving the perspective-n-point (PnP) problem with much better behavior than +the conventional DLT. The modification consists of analytically weighting the +different measurements in the linear system with a negligible increase in +computational load. Our approach exhibits clear improvements -- in both +performance and runtime -- when compared to popular methods such as EPnP, CPnP, +RPnP, and OPnP. Our new non-iterative solution approaches that of the true +optimal found via Gauss-Newton optimization, but at a fraction of the +computational cost. Our optimal DLT (oDLT) implementation, as well as the +experiments, are released in open source. + +
+
+ comment: 8 pages, 6 figures, 2 tables +
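+ For context on the kind of linear system the paper weights, the sketch below
+ implements a plain DLT estimate of the 3x4 projection matrix from 2D-3D
+ correspondences, with an optional per-measurement weight vector to hint at the
+ analytic weighting described above. The actual oDLT weights and normalization
+ are in the authors' open-source release; this is only an illustrative baseline.
+
+import numpy as np
+
+def weighted_dlt_projection(X_world, x_img, weights=None):
+    """Estimate the 3x4 projection matrix P from n >= 6 correspondences.
+    X_world: (n, 3) 3D points; x_img: (n, 2) pixel coordinates;
+    weights: optional (n,) per-measurement weights (uniform if None)."""
+    n = len(X_world)
+    w = np.ones(n) if weights is None else np.asarray(weights, dtype=float)
+    Xh = np.hstack([X_world, np.ones((n, 1))])          # homogeneous 3D points
+    A = np.zeros((2 * n, 12))
+    for i, (Xi, (u, v)) in enumerate(zip(Xh, x_img)):
+        A[2 * i, 0:4] = w[i] * Xi
+        A[2 * i, 8:12] = -w[i] * u * Xi
+        A[2 * i + 1, 4:8] = w[i] * Xi
+        A[2 * i + 1, 8:12] = -w[i] * v * Xi
+    # The null space of A (smallest right singular vector) gives P up to scale.
+    _, _, Vt = np.linalg.svd(A)
+    return Vt[-1].reshape(3, 4)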
+
+
+
+
+ + ♻ SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon + Visuomotor Learning + + +
+ We present a low-cost legged mobile manipulation system that solves +long-horizon real-world tasks, trained by reinforcement learning purely in +simulation. This system is made possible by 1) a hierarchical design of a +high-level policy for visual-mobile manipulation following instructions and a +low-level policy for quadruped movement and limb control, 2) a progressive +exploration and learning approach that leverages privileged task decomposition +information to train the teacher policy for long-horizon tasks, which will +guide an imitation-based student policy for efficient training of the +high-level visuomotor policy, and 3) a suite of techniques for minimizing +sim-to-real gaps. + In contrast to previous approaches that use high-end equipment, our system +demonstrates effective performance with more accessible hardware - +specifically, a Unitree Go1 quadruped, a WidowX250S arm, and a single +wrist-mounted RGB camera - despite the increased challenges of sim-to-real +transfer. When fully trained in simulation, a single policy autonomously solves +long-horizon tasks such as search, move, grasp, and drop-into, achieving nearly +80% success. This performance is comparable to that of expert human +teleoperation on the same tasks but significantly more efficient, operating at +about 1.5x the speed. The sim-to-real transfer is fluid across diverse indoor +and outdoor scenes under varying lighting conditions. Finally, we discuss the +key techniques that enable the entire pipeline, including efficient RL training +and sim-to-real, to work effectively for legged mobile manipulation, and +present their ablation results. + +
+
+
+
+
+ + ♻ ☆ Imperative Learning: A Self-supervised Neuro-Symbolic Learning Framework + for Robot Autonomy + + +
+ Data-driven methods such as reinforcement and imitation learning have +achieved remarkable success in robot autonomy. However, their data-centric +nature still hinders them from generalizing well to ever-changing environments. +Moreover, collecting large datasets for robotic tasks is often impractical and +expensive. To overcome these challenges, we introduce a new self-supervised +neuro-symbolic (NeSy) computational framework, imperative learning (IL), for +robot autonomy, leveraging the generalization abilities of symbolic reasoning. +The framework of IL consists of three primary components: a neural module, a +reasoning engine, and a memory system. We formulate IL as a special bilevel +optimization (BLO), which enables reciprocal learning over the three modules. +This overcomes the label-intensive obstacles associated with data-driven +approaches and takes advantage of symbolic reasoning concerning logical +reasoning, physical principles, geometric analysis, etc. We discuss several +optimization techniques for IL and verify their effectiveness in five distinct +robot autonomy tasks including path planning, rule induction, optimal control, +visual odometry, and multi-robot routing. Through various experiments, we show +that IL can significantly enhance robot autonomy capabilities and we anticipate +that it will catalyze further research across diverse domains. + +
+
+
+
+
+ + ♻ ☆ Towards Robust Spacecraft Trajectory Optimization via Transformers + + +
+ Future multi-spacecraft missions require robust autonomous trajectory +optimization capabilities to ensure safe and efficient rendezvous operations. +This capability hinges on solving non-convex optimal control problems in +real-time, although traditional iterative methods such as sequential convex +programming impose significant computational challenges. To mitigate this +burden, the Autonomous Rendezvous Transformer (ART) introduced a generative +model trained to provide near-optimal initial guesses. This approach provides +convergence to better local optima (e.g., fuel optimality), improves +feasibility rates, and results in faster convergence speed of optimization +algorithms through warm-starting. This work extends the capabilities of ART to +address robust chance-constrained optimal control problems. Specifically, ART +is applied to challenging rendezvous scenarios in Low Earth Orbit (LEO), +ensuring fault-tolerant behavior under uncertainty. Through extensive +experimentation, the proposed warm-starting strategy is shown to consistently +produce high-quality reference trajectories, achieving up to 30\% cost +improvement and 50\% reduction in infeasible cases compared to conventional +methods, demonstrating robust performance across multiple state +representations. Additionally, a post hoc evaluation framework is proposed to +assess the quality of generated trajectories and mitigate runtime failures, +marking an initial step toward the reliable deployment of AI-driven solutions +in safety-critical autonomous systems such as spacecraft. + +
+
+ comment: Submitted to the IEEE Aerospace Conference 2025. 13 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Online Fault Tolerance Strategy for Abrupt Reachability Constraint + Changes + + +
+ When a system's constraints change abruptly, the system's reachability safety
+no longer holds. Thus, the system can reach a forbidden/dangerous value. The
+conventional remedy typically involves online controller redesign (OCR) to
+re-establish the reachability's compliance with the new constraints, which,
+however, is usually too slow. There is a need for an online strategy capable of
+managing runtime changes in reachability constraints. However, to the best of
+the authors' knowledge, this topic has not been addressed in the existing
+literature. In this paper, we propose a fast fault tolerance strategy to
+recover the system's reachability safety at runtime. Instead of redesigning the
+system's controller, we propose to change the system's reference state to
+modify the system's reachability to comply with the new constraints. We frame
+the reference state search as an optimization problem and employ the
+Karush-Kuhn-Tucker (KKT) method as well as the Interior Point Method (IPM)
+based Newton's method (as a fallback for the KKT method) for fast solution
+derivation. The optimization also leaves margin for tolerating future faults.
+Numerical simulations demonstrate that our method outperforms the conventional
+OCR method in terms of computational efficiency and success rate. Specifically,
+the results show that the proposed method finds a solution $10^{2}$ (with the
+IPM-based Newton's method) to $10^{4}$ (with the KKT method) times faster than
+the OCR method. Additionally, our method improves the success rate over the OCR
+method by $40.81\%$ when no runtime deadline is imposed. The success rate
+remains at $49.44\%$ for the proposed method, while it drops to $0\%$ for the
+OCR method when a deadline of $1.5$ seconds is imposed.
+
+
+ comment: 9 pages, 2 figures, +
+
+
+
+
+ + ♻ ☆ Neural Scaling Laws in Robotics + + +
+ Neural scaling laws have driven significant advancements in machine learning,
+particularly in domains like language modeling and computer vision. However,
+neural scaling laws within robotics have remained relatively underexplored,
+despite the growing adoption of foundation models in this field. This paper
+represents the first comprehensive study to quantify neural scaling laws for
+Robot Foundation Models (RFMs) and Large Language Models (LLMs) in robotics
+tasks. Through a meta-analysis of 327 research papers, we investigate how data
+size, model size, and compute resources influence downstream performance across
+a diverse set of robotic tasks. Consistent with previous scaling law research,
+our results reveal that the performance of robotic models improves with
+increased resources, following a power-law relationship. Promisingly, the
+improvement in robotic task performance scales notably faster than in language
+tasks. This suggests that, while performance on downstream robotic tasks today
+is often moderate-to-poor, increased data and compute are likely to
+significantly improve performance in the future. Also consistent with previous
+scaling law research, we observe the emergence of new robot capabilities as
+models scale.
+
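+ The power-law relationship reported above can be recovered from
+ (resource, performance) pairs with a simple log-log least-squares fit. The
+ data points below are synthetic and purely illustrative, not results from the
+ meta-analysis.
+
+import numpy as np
+
+def fit_power_law(x, y):
+    """Fit y = a * x**b by linear regression in log-log space; returns (a, b)."""
+    b, log_a = np.polyfit(np.log(x), np.log(y), deg=1)
+    return np.exp(log_a), b
+
+# Synthetic example: performance improving with compute along a power law.
+compute = np.array([1e18, 1e19, 1e20, 1e21])
+score = 0.05 * compute ** 0.12
+a, b = fit_power_law(compute, score)
+print(f"a = {a:.3g}, exponent b = {b:.3f}")  # recovers b ~ 0.12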
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 27 + +
+
+
+ + ☆ A Predictive Approach for Enhancing Accuracy in Remote Robotic Surgery + Using Informer Model + + +
+ Precise and real-time estimation of the robotic arm's position on the +patient's side is essential for the success of remote robotic surgery in +Tactile Internet (TI) environments. This paper presents a prediction model +based on the Transformer-based Informer framework for accurate and efficient +position estimation. Additionally, it combines a Four-State Hidden Markov Model +(4-State HMM) to simulate realistic packet loss scenarios. The proposed +approach addresses challenges such as network delays, jitter, and packet loss +to ensure reliable and precise operation in remote surgical applications. The +method integrates the optimization problem into the Informer model by embedding +constraints such as energy efficiency, smoothness, and robustness into its +training process using a differentiable optimization layer. The Informer +framework uses features such as ProbSparse attention, attention distilling, and +a generative-style decoder to focus on position-critical features while +maintaining a low computational complexity of O(L log L). The method is +evaluated using the JIGSAWS dataset, achieving a prediction accuracy of over 90 +percent under various network scenarios. A comparison with models such as TCN, +RNN, and LSTM demonstrates the Informer framework's superior performance in +handling position prediction and meeting real-time requirements, making it +suitable for Tactile Internet-enabled robotic surgery. + +
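+ The packet-loss component mentioned above can be approximated by a 4-state
+ Markov chain whose states either deliver or drop packets, giving bursty loss
+ patterns. The transition matrix and state semantics below are invented for
+ illustration and are not the parameters used in the paper.
+
+import numpy as np
+
+def simulate_packet_loss(n_steps, seed=0):
+    """Simulate bursty packet loss with a 4-state Markov chain.
+    States 0-1 deliver packets, states 2-3 drop them (deeper loss bursts)."""
+    rng = np.random.default_rng(seed)
+    # Illustrative transition matrix (rows sum to 1), not the paper's values.
+    T = np.array([[0.90, 0.07, 0.02, 0.01],
+                  [0.20, 0.70, 0.08, 0.02],
+                  [0.10, 0.20, 0.60, 0.10],
+                  [0.05, 0.10, 0.25, 0.60]])
+    delivered = np.array([True, True, False, False])
+    state, received = 0, []
+    for _ in range(n_steps):
+        received.append(delivered[state])
+        state = rng.choice(4, p=T[state])
+    return np.array(received)
+
+loss_rate = 1.0 - simulate_packet_loss(10000).mean()
+print(f"simulated loss rate: {loss_rate:.3f}")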
+
+
+
+
+ + ☆ Gaussian-Process-based Adaptive Tracking Control with Dynamic Active + Learning for Autonomous Ground Vehicles + + +
+ This article proposes an active-learning-based adaptive trajectory tracking
+control method for autonomous ground vehicles to compensate for modeling errors
+and unmodeled dynamics. The nominal vehicle model is decoupled into lateral and
+longitudinal subsystems, which are augmented with online Gaussian Processes
+(GPs), using measurement data. The estimated mean functions of the GPs are used
+to construct a feedback compensator, which, together with an LPV state feedback
+controller designed for the nominal system, gives the adaptive control
+structure. To assist exploration of the dynamics, the paper proposes a new,
+dynamic active learning method to collect the most informative samples to
+accelerate the training process. To analyze the performance of the controller
+provided by the overall learning tool-chain, a novel iterative,
+counterexample-based algorithm is proposed for calculating the induced L2 gain
+between the reference trajectory and the tracking error. The analysis can be
+executed for a set of possible realizations of the to-be-controlled system,
+giving a robust performance certificate for the learning method under variation
+of the vehicle dynamics. The efficiency of the proposed control approach is
+shown in a high-fidelity physics simulator and in real experiments using a 1/10
+scale F1TENTH electric car.
+
+
+ comment: Submitted to IEEE Transactions on Control Systems Technology +
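+ The GP-augmentation idea above -- learn the discrepancy between the nominal
+ model and the measured dynamics, then feed the GP mean back as a compensation
+ term -- can be sketched with scikit-learn. The regressor inputs, kernel, and
+ synthetic residual data are placeholders, not the paper's setup.
+
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+# Training data: regressor z and the observed residual between the measured
+# response and the nominal model's prediction (synthetic placeholders).
+rng = np.random.default_rng(0)
+z = rng.uniform(-1, 1, size=(200, 2))               # e.g., [velocity, steering]
+residual = 0.3 * np.sin(3 * z[:, 0]) * z[:, 1] + 0.01 * rng.standard_normal(200)
+
+gp = GaussianProcessRegressor(kernel=RBF(length_scale=0.5) + WhiteKernel(1e-3),
+                              normalize_y=True)
+gp.fit(z, residual)
+
+def compensated_command(nominal_command, z_now):
+    """Subtract the GP-predicted model error from the nominal feedback command."""
+    correction, std = gp.predict(np.atleast_2d(z_now), return_std=True)
+    return nominal_command - correction[0], std[0]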
+
+
+
+
+ + ☆ QuIP: Experimental design for expensive simulators with many Qualitative + factors via Integer Programming + + +
+ The need to explore and/or optimize expensive simulators with many +qualitative factors arises in broad scientific and engineering problems. Our +motivating application lies in path planning - the exploration of feasible +paths for navigation, which plays an important role in robotics, surgical +planning and assembly planning. Here, the feasibility of a path is evaluated +via expensive virtual experiments, and its parameter space is typically +discrete and high-dimensional. A carefully selected experimental design is thus +essential for timely decision-making. We propose here a novel framework, called +QuIP, for experimental design of Qualitative factors via Integer Programming +under a Gaussian process surrogate model with an exchangeable covariance +function. For initial design, we show that its asymptotic D-optimal design can +be formulated as a variant of the well-known assignment problem in operations +research, which can be efficiently solved to global optimality using +state-of-the-art integer programming solvers. For sequential design +(specifically, for active learning or black-box optimization), we show that its +design criterion can similarly be formulated as an assignment problem, thus +enabling efficient and reliable optimization with existing solvers. We then +demonstrate the effectiveness of QuIP over existing methods in a suite of path +planning experiments and an application to rover trajectory optimization. + +
+
+ comment: 40 pages, 6 figures, submitted to JCGS +
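+ The reduction of the design criterion to an assignment problem can be
+ illustrated with scipy's Hungarian-algorithm solver. The random cost matrix
+ below merely stands in for the D-optimality-derived costs; the paper solves
+ its formulation with general integer programming solvers.
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+rng = np.random.default_rng(1)
+# cost[i, j]: cost of assigning design slot i to qualitative level j (placeholder).
+cost = rng.random((8, 8))
+
+rows, cols = linear_sum_assignment(cost)   # globally optimal assignment
+print(list(zip(rows.tolist(), cols.tolist())))
+print("total cost:", cost[rows, cols].sum())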
+
+
+
+
+ + ☆ Visual Localization via Semantic Structures in Autonomous Photovoltaic + Power Plant Inspection + + +
+ Inspection systems utilizing unmanned aerial vehicles (UAVs) equipped with +thermal cameras are increasingly popular for the maintenance of photovoltaic +(PV) power plants. However, automation of the inspection task is a challenging +problem as it requires precise navigation to capture images from optimal +distances and viewing angles. + This paper presents a novel localization pipeline that directly integrates PV +module detection with UAV navigation, allowing precise positioning during +inspection. Detections are used to identify the power plant structures in the +image and associate these with the power plant model. We define visually +recognizable anchor points for the initial association and use object tracking +to discern global associations. We present three distinct methods for visual +segmentation of PV modules based on traditional computer vision, deep learning, +and their fusion, and we evaluate their performance in relation to the proposed +localization pipeline. + The presented methods were verified and evaluated using custom aerial +inspection data sets, demonstrating their robustness and applicability for +real-time navigation. Additionally, we evaluate the influence of the power +plant model's precision on the localization methods. + +
+
+ comment: 47 pages, 22 figures +
+
+
+
+
+ + ☆ Optimizing Grasping Precision for Industrial Pick-and-Place Tasks + Through a Novel Visual Servoing Approach + + +
+ The integration of robotic arm manipulators into industrial manufacturing
+lines has become common, thanks to their efficiency and effectiveness in
+executing specific tasks. With advancements in camera technology, visual
+sensors and perception systems have been incorporated to address more complex
+operations. This study introduces a novel visual servoing control system
+designed for robotic operations in challenging environments, where accurate
+object pose estimation is hindered by factors such as vibrations, tool path
+deviations, and machining marks. To overcome these obstacles, our solution
+focuses on enhancing the accuracy of picking and placing tasks, ensuring
+reliable performance across various scenarios. This is accomplished by a novel
+visual servoing method based on the integration of two complementary
+methodologies: a technique for object localization and a separate approach for
+precise control through visual feedback, leveraging their strengths to address
+the challenges posed by the industrial context and thereby improving overall
+grasping accuracy. Our method employs feedback from perception sensors to
+adjust the control loop efficiently, enabling the robotic system to adeptly
+pick and place objects. We have introduced a controller capable of seamlessly
+managing the detection and manipulation of various shapes and types of objects
+within an industrial context, addressing numerous challenges that arise in such
+environments.
+
+
+
+
+
+ + ☆ Robustified Time-optimal Point-to-point Motion Planning and Control + under Uncertainty + + +
+ This paper proposes a novel approach to formulate time-optimal point-to-point +motion planning and control under uncertainty. The approach defines a +robustified two-stage Optimal Control Problem (OCP), in which stage 1, with a +fixed time grid, is seamlessly stitched with stage 2, which features a variable +time grid. Stage 1 optimizes not only the nominal trajectory, but also feedback +gains and corresponding state covariances, which robustify constraints in both +stages. The outcome is a minimized uncertainty in stage 1 and a minimized total +motion time for stage 2, both contributing to the time optimality and safety of +the total motion. A timely replanning strategy is employed to handle changes in +constraints and maintain feasibility, while a tailored iterative algorithm is +proposed for efficient, real-time OCP execution. + +
+
+
+
+
+ + ☆ ABPT: Amended Backpropagation through Time with Partially Differentiable + Rewards + + +
+ Using the exact gradients of the rewards to directly optimize policy +parameters via backpropagation-through-time (BPTT) enables high training +performance for quadrotor tasks. However, designing a fully differentiable +reward architecture is often challenging. Partially differentiable rewards will +result in biased gradient propagation that degrades training performance. To +overcome this limitation, we propose Amended Backpropagation-through-Time +(ABPT), a novel approach that mitigates gradient bias while preserving the +training efficiency of BPTT. ABPT combines 0-step and N-step returns, +effectively reducing the bias by leveraging value gradients from the learned +Q-value function. Additionally, it adopts entropy regularization and state +initialization mechanisms to encourage exploration during training. We evaluate +ABPT on four representative quadrotor flight tasks. Experimental results +demonstrate that ABPT converges significantly faster and achieves higher +ultimate rewards than existing learning algorithms, particularly in tasks +involving partially differentiable rewards. + +
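+ The combination of 0-step and N-step returns at the core of ABPT can be
+ written down in a few lines: propagate gradients through N differentiable
+ reward terms, bootstrap with a learned Q-value, and mix in the 0-step return
+ from the critic. The discount and mixing coefficient below are illustrative,
+ and this is a sketch of the general idea rather than the authors' exact
+ formulation.
+
+import torch
+
+def blended_return(rewards, q_bootstrap, q_now, gamma=0.99, mix=0.5):
+    """Blend a 0-step return (critic value at the current state) with an N-step
+    return that backpropagates through N differentiable rewards and then
+    bootstraps with the critic.
+    rewards: (N, B) differentiable rewards; q_bootstrap: (B,) Q(s_{t+N});
+    q_now: (B,) Q(s_t)."""
+    N = rewards.shape[0]
+    steps = torch.arange(N, dtype=rewards.dtype, device=rewards.device)
+    discounts = (gamma ** steps).unsqueeze(1)                # (N, 1)
+    n_step = (discounts * rewards).sum(dim=0) + gamma ** N * q_bootstrap
+    return mix * n_step + (1.0 - mix) * q_now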
+
+
+
+
+ + ☆ Benchmarking global optimization techniques for unmanned aerial vehicle + path planning + + +
+ The Unmanned Aerial Vehicle (UAV) path planning problem is a complex
+optimization problem in the field of robotics. In this paper, we investigate
+the possible utilization of this problem in benchmarking global optimization
+methods. We devise a problem instance generator and pick 56 representative
+instances, which we compare to established benchmarking suites through
+Exploratory Landscape Analysis to show their uniqueness. For the computational
+comparison, we select twelve well-performing global optimization techniques
+from both subfields of stochastic algorithms (evolutionary computation methods)
+and deterministic algorithms (Dividing RECTangles, or DIRECT-type methods). The
+experiments were conducted in settings with varying dimensionality and
+computational budgets. The results were analyzed through several criteria
+(number of best-found solutions, mean relative error, Friedman ranks) and
+utilized established statistical tests. The best-ranking methods for the UAV
+problems were almost universally the top-performing evolutionary techniques
+from recent competitions on numerical optimization at the Institute of
+Electrical and Electronics Engineers Congress on Evolutionary Computation.
+Lastly, we discuss the variable dimension characteristics of the studied UAV
+problems, which remain largely under-investigated.
+
+
+
+
+
+ + ☆ LiDAR-Based Vehicle Detection and Tracking for Autonomous Racing + + +
+ Autonomous racing provides a controlled environment for testing the software +and hardware of autonomous vehicles operating at their performance limits. +Competitive interactions between multiple autonomous racecars however introduce +challenging and potentially dangerous scenarios. Accurate and consistent +vehicle detection and tracking is crucial for overtaking maneuvers, and +low-latency sensor processing is essential to respond quickly to hazardous +situations. This paper presents the LiDAR-based perception algorithms deployed +on Team PoliMOVE's autonomous racecar, which won multiple competitions in the +Indy Autonomous Challenge series. Our Vehicle Detection and Tracking pipeline +is composed of a novel fast Point Cloud Segmentation technique and a specific +Vehicle Pose Estimation methodology, together with a variable-step Multi-Target +Tracking algorithm. Experimental results demonstrate the algorithm's +performance, robustness, computational efficiency, and suitability for +autonomous racing applications, enabling fully autonomous overtaking maneuvers +at velocities exceeding 275 km/h. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Visual-Lidar Map Alignment for Infrastructure Inspections + + +
+ Routine and repetitive infrastructure inspections present safety, efficiency, +and consistency challenges as they are performed manually, often in challenging +or hazardous environments. They can also introduce subjectivity and errors into +the process, resulting in undesirable outcomes. Simultaneous localization and +mapping (SLAM) presents an opportunity to generate high-quality 3D maps that +can be used to extract accurate and objective inspection data. Yet, many SLAM +algorithms are limited in their ability to align 3D maps from repeated +inspections in GPS-denied settings automatically. This limitation hinders +practical long-term asset health assessments by requiring tedious manual +alignment for data association across scans from previous inspections. This +paper introduces a versatile map alignment algorithm leveraging both visual and +lidar data for improved place recognition robustness and presents an +infrastructure-focused dataset tailored for consecutive inspections. By +detaching map alignment from SLAM, our approach enhances infrastructure +inspection pipelines, supports monitoring asset degradation over time, and +invigorates SLAM research by permitting exploration beyond existing +multi-session SLAM algorithms. + +
+
+ comment: 8 pages, 8 figures, for associated code see + https://github.com/jakemclaughlin6/vlma +
+
+
+
+
+ + ☆ MARL-OT: Multi-Agent Reinforcement Learning Guided Online Fuzzing to + Detect Safety Violation in Autonomous Driving Systems + + +
+ Autonomous Driving Systems (ADSs) are safety-critical, as real-world safety +violations can result in significant losses. Rigorous testing is essential +before deployment, with simulation testing playing a key role. However, ADSs +are typically complex, consisting of multiple modules such as perception and +planning, or well-trained end-to-end autonomous driving systems. Offline +methods, such as the Genetic Algorithm (GA), can only generate predefined +trajectories for dynamics, which struggle to cause safety violations for ADSs +rapidly and efficiently in different scenarios due to their evolutionary +nature. Online methods, such as single-agent reinforcement learning (RL), can +quickly adjust the dynamics' trajectory online to adapt to different scenarios, +but they struggle to capture complex corner cases of ADS arising from the +intricate interplay among multiple vehicles. Multi-agent reinforcement learning +(MARL) has a strong ability in cooperative tasks. On the other hand, it faces +its own challenges, particularly with convergence. This paper introduces +MARL-OT, a scalable framework that leverages MARL to detect safety violations +of ADS resulting from surrounding vehicles' cooperation. MARL-OT employs MARL +for high-level guidance, triggering various dangerous scenarios for the +rule-based online fuzzer to explore potential safety violations of ADS, thereby +generating dynamic, realistic safety violation scenarios. Our approach improves +the detected safety violation rate by up to 136.2% compared to the +state-of-the-art (SOTA) testing technique. + +
+
+
+
+
+ + ☆ Learning more with the same effort: how randomization improves the + robustness of a robotic deep reinforcement learning agent + + +
+ The industrial application of Deep Reinforcement Learning (DRL) is frequently +slowed down because of the inability to generate the experience required to +train the models. Collecting data often involves considerable time and economic +effort that is unaffordable in most cases. Fortunately, devices like robots can +be trained with synthetic experience thanks to virtual environments. With this +approach, the sample efficiency problems of artificial agents are mitigated, +but another issue arises: the need for efficiently transferring the synthetic +experience into the real world (sim-to-real). + This paper analyzes the robustness of a state-of-the-art sim-to-real +technique known as progressive neural networks (PNNs) and studies how adding +diversity to the synthetic experience can complement it. To better understand +the drivers that lead to a lack of robustness, the robotic agent is still +tested in a virtual environment to ensure total control on the divergence +between the simulated and real models. + The results show that a PNN-like agent exhibits a substantial decrease in its +robustness at the beginning of the real training phase. Randomizing certain +variables during simulation-based training significantly mitigates this issue. +On average, the increase in the model's accuracy is around 25% when diversity +is introduced in the training process. This improvement can be translated into +a decrease in the required real experience for the same final robustness +performance. Notwithstanding, adding real experience to agents should still be +beneficial regardless of the quality of the virtual experience fed into the +agent. + +
+
+ comment: This article was accepted and published in Applied Intelligence + (10.1007/s10489-022-04227-3) +
+
+
+
+
+ + ☆ SKIL: Semantic Keypoint Imitation Learning for Generalizable + Data-efficient Manipulation + + +
+ Real-world tasks such as garment manipulation and table rearrangement require robots to perform generalizable, highly precise, and long-horizon actions. Although imitation learning has proven to be an effective approach for teaching robots new skills, large amounts of expert demonstration data are still indispensable for these complex tasks, resulting in high sample complexity and costly data collection. To address this, we propose Semantic Keypoint Imitation Learning (SKIL), a framework that automatically obtains semantic keypoints with the help of vision foundation models and forms semantic-keypoint descriptors that enable efficient imitation learning of complex robotic tasks with significantly lower sample complexity. In real-world experiments, SKIL doubles the performance of baseline methods in tasks such as picking up a cup or a mouse, while demonstrating exceptional robustness to variations in objects, environmental changes, and distractors. For long-horizon tasks like hanging a towel on a rack, where previous methods fail completely, SKIL achieves a mean success rate of 70\% with as few as 30 demonstrations. Furthermore, SKIL naturally supports cross-embodiment learning due to its semantic keypoint abstraction; our experiments demonstrate that even human videos bring considerable improvement to learning performance. All these results demonstrate the great success of SKIL in achieving data-efficient, generalizable robotic learning. Visualizations and code are available at: https://skil-robotics.github.io/SKIL-robotics/. +
+
+ comment: 22 pages, 22 figures +
+
+
+
+
+ + ☆ Dream to Fly: Model-Based Reinforcement Learning for Vision-Based Drone + Flight + + +
+ Autonomous drone racing has risen as a challenging robotic benchmark for +testing the limits of learning, perception, planning, and control. Expert human +pilots are able to agilely fly a drone through a race track by mapping the +real-time feed from a single onboard camera directly to control commands. +Recent works in autonomous drone racing attempting direct pixel-to-commands +control policies (without explicit state estimation) have relied on either +intermediate representations that simplify the observation space or performed +extensive bootstrapping using Imitation Learning (IL). This paper introduces an +approach that learns policies from scratch, allowing a quadrotor to +autonomously navigate a race track by directly mapping raw onboard camera +pixels to control commands, just as human pilots do. By leveraging model-based +reinforcement learning~(RL) - specifically DreamerV3 - we train visuomotor +policies capable of agile flight through a race track using only raw pixel +observations. While model-free RL methods such as PPO struggle to learn under +these conditions, DreamerV3 efficiently acquires complex visuomotor behaviors. +Moreover, because our policies learn directly from pixel inputs, the +perception-aware reward term employed in previous RL approaches to guide the +training process is no longer needed. Our experiments demonstrate in both +simulation and real-world flight how the proposed approach can be deployed on +agile quadrotors. This approach advances the frontier of vision-based +autonomous flight and shows that model-based RL is a promising direction for +real-world robotics. + +
+
+ comment: 11 pages, 7 Figures +
+
+
+
+
+ + ☆ Scalable Benchmarking and Robust Learning for Noise-Free Ego-Motion and + 3D Reconstruction from Noisy Video ICLR 2025 + + +
+ We aim to redefine robust ego-motion estimation and photorealistic 3D +reconstruction by addressing a critical limitation: the reliance on noise-free +data in existing models. While such sanitized conditions simplify evaluation, +they fail to capture the unpredictable, noisy complexities of real-world +environments. Dynamic motion, sensor imperfections, and synchronization +perturbations lead to sharp performance declines when these models are deployed +in practice, revealing an urgent need for frameworks that embrace and excel +under real-world noise. To bridge this gap, we tackle three core challenges: +scalable data generation, comprehensive benchmarking, and model robustness +enhancement. First, we introduce a scalable noisy data synthesis pipeline that +generates diverse datasets simulating complex motion, sensor imperfections, and +synchronization errors. Second, we leverage this pipeline to create +Robust-Ego3D, a benchmark rigorously designed to expose noise-induced +performance degradation, highlighting the limitations of current learning-based +methods in ego-motion accuracy and 3D reconstruction quality. Third, we propose +Correspondence-guided Gaussian Splatting (CorrGS), a novel test-time adaptation +method that progressively refines an internal clean 3D representation by +aligning noisy observations with rendered RGB-D frames from clean 3D map, +enhancing geometric alignment and appearance restoration through visual +correspondence. Extensive experiments on synthetic and real-world data +demonstrate that CorrGS consistently outperforms prior state-of-the-art +methods, particularly in scenarios involving rapid motion and dynamic +illumination. + +
+
+ comment: Accepted by ICLR 2025; 92 Pages; Project Repo: + https://github.com/Xiaohao-Xu/SLAM-under-Perturbation. arXiv admin note: + substantial text overlap with arXiv:2406.16850 +
+
+
+
+
+ + ☆ Enhancing Robotic Precision in Construction: A Modular Factor + Graph-Based Framework to Deflection and Backlash Compensation Using + High-Accuracy Accelerometers + + +
+ Accurate positioning is crucial in the construction industry, where labor +shortages highlight the need for automation. Robotic systems with long +kinematic chains are required to reach complex workspaces, including floors, +walls, and ceilings. These requirements significantly impact positioning +accuracy due to effects such as deflection and backlash in various parts along +the kinematic chain. In this work, we introduce a novel approach that +integrates deflection and backlash compensation models with high-accuracy +accelerometers, significantly enhancing position accuracy. Our method employs a +modular framework based on a factor graph formulation to estimate the state of +the kinematic chain, leveraging acceleration measurements to inform the model. +Extensive testing on publicly released datasets, reflecting real-world +construction disturbances, demonstrates the advantages of our approach. The +proposed method reduces the $95\%$ error threshold in the xy-plane by $50\%$ +compared to the state-of-the-art Virtual Joint Method, and by $31\%$ when +incorporating base tilt compensation. + +
+
+ comment: 8 pages, 7 figures, Accepted on November 2024 at IEEE Robotics and + Automation Letters +
+
+
+
+
+ + ☆ Point-LN: A Lightweight Framework for Efficient Point Cloud + Classification Using Non-Parametric Positional Encoding + + +
+ We introduce Point-LN, a novel lightweight framework engineered for efficient 3D point cloud classification. Point-LN integrates essential non-parametric components, such as Farthest Point Sampling (FPS), k-Nearest Neighbors (k-NN), and non-learnable positional encoding, with a streamlined learnable classifier that significantly enhances classification accuracy while maintaining a minimal parameter footprint. This hybrid architecture ensures low computational costs and rapid inference speeds, making Point-LN ideal for real-time and resource-constrained applications. Comprehensive evaluations on benchmark datasets, including ModelNet40 and ScanObjectNN, demonstrate that Point-LN achieves competitive performance compared to state-of-the-art methods, all while offering exceptional efficiency. These results establish Point-LN as a robust and scalable solution for diverse point cloud classification tasks, highlighting its potential for widespread adoption in various computer vision applications. +
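+
+ The non-parametric components named above (FPS and k-NN) can be sketched in a few lines of plain NumPy; this is only an illustration of those building blocks, not the paper's implementation, and the learnable linear classifier on top is omitted:
+
+     import numpy as np
+
+     def farthest_point_sampling(points, m):
+         # Greedy FPS: pick the indices of m well-spread points from an (N, 3) cloud.
+         n = points.shape[0]
+         chosen = [0]
+         dist = np.full(n, np.inf)
+         for _ in range(m - 1):
+             dist = np.minimum(dist, np.linalg.norm(points - points[chosen[-1]], axis=1))
+             chosen.append(int(dist.argmax()))
+         return np.array(chosen)
+
+     def knn_indices(points, queries, k):
+         # Brute-force k-NN: indices of the k nearest neighbours of each query point.
+         d = np.linalg.norm(queries[:, None, :] - points[None, :, :], axis=-1)
+         return np.argsort(d, axis=1)[:, :k]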
+
+ comment: This paper has been accepted for presentation at the 29th + International Computer Conference, Computer Society of Iran (CSICC) 2025 +
+
+
+
+
+ + ☆ You Only Teach Once: Learn One-Shot Bimanual Robotic Manipulation from + Video Demonstrations + + +
+ Bimanual robotic manipulation is a long-standing challenge of embodied intelligence due to its characteristics of dual-arm spatial-temporal coordination and high-dimensional action spaces. Previous studies rely on pre-defined action taxonomies or direct teleoperation to alleviate or circumvent these issues, often at the cost of simplicity, versatility, and scalability. In contrast, we believe that the most effective and efficient way to teach bimanual manipulation is learning from human-demonstrated videos, where rich features such as spatial-temporal positions, dynamic postures, interaction states, and dexterous transitions are available almost for free. In this work, we propose YOTO (You Only Teach Once), which can extract and then inject patterns of bimanual actions from as few as a single binocular observation of hand movements, and teach dual robot arms various complex tasks. Furthermore, building on keyframe-based motion trajectories, we devise a subtle solution for rapidly generating training demonstrations with diverse variations of manipulated objects and their locations. These data can then be used to learn a customized bimanual diffusion policy (BiDP) across diverse scenes. In experiments, YOTO achieves impressive performance in mimicking 5 intricate long-horizon bimanual tasks, possesses strong generalization under different visual and spatial conditions, and outperforms existing visuomotor imitation learning methods in accuracy and efficiency. Our project link is https://hnuzhy.github.io/projects/YOTO. +
+
+ comment: under review +
+
+
+
+
+ + ☆ RaccoonBot: An Autonomous Wire-Traversing Solar-Tracking Robot for + Persistent Environmental Monitoring ICRA 2025 + + +
+ Environmental monitoring is used to characterize the health and relationship between organisms and their environments. In forest ecosystems, robots can serve as platforms to acquire such data, even in hard-to-reach places, where wire-traversing platforms are particularly promising due to their efficient displacement. This paper presents RaccoonBot, a novel autonomous wire-traversing robot for persistent environmental monitoring, featuring a fail-safe mechanical design with a self-locking mechanism in case of an electrical shortage. The robot also features energy-aware mobility through a novel solar-tracking algorithm that allows the robot to find a position on the wire with direct exposure to sunlight, increasing the energy harvested. Experimental results validate the electro-mechanical features of the RaccoonBot, showing that it is able to handle wire perturbations and different inclinations, and to achieve energy autonomy. +
+
+ comment: Pre-print submitted to the 2025 IEEE International Conference on + Robotics & Automation (ICRA 2025) +
+
+
+
+
+ + ☆ HAMMER: Heterogeneous, Multi-Robot Semantic Gaussian Splatting + + +
+ 3D Gaussian Splatting offers expressive scene reconstruction, modeling a +broad range of visual, geometric, and semantic information. However, efficient +real-time map reconstruction with data streamed from multiple robots and +devices remains a challenge. To that end, we propose HAMMER, a server-based +collaborative Gaussian Splatting method that leverages widely available ROS +communication infrastructure to generate 3D, metric-semantic maps from +asynchronous robot data-streams with no prior knowledge of initial robot +positions and varying on-device pose estimators. HAMMER consists of (i) a frame +alignment module that transforms local SLAM poses and image data into a global +frame and requires no prior relative pose knowledge, and (ii) an online module +for training semantic 3DGS maps from streaming data. HAMMER handles mixed +perception modes, adjusts automatically for variations in image pre-processing +among different devices, and distills CLIP semantic codes into the 3D scene for +open-vocabulary language queries. In our real-world experiments, HAMMER creates +higher-fidelity maps (2x) compared to competing baselines and is useful for +downstream tasks, such as semantic goal-conditioned navigation (e.g., ``go to +the couch"). Accompanying content available at hammer-project.github.io. + +
+
+
+
+
+ + ☆ Force-Based Robotic Imitation Learning: A Two-Phase Approach for + Construction Assembly Tasks + + +
+ The drive for efficiency and safety in construction has boosted the role of +robotics and automation. However, complex tasks like welding and pipe insertion +pose challenges due to their need for precise adaptive force control, which +complicates robotic training. This paper proposes a two-phase system to improve +robot learning, integrating human-derived force feedback. The first phase +captures real-time data from operators using a robot arm linked with a virtual +simulator via ROS-Sharp. In the second phase, this feedback is converted into +robotic motion instructions, using a generative approach to incorporate force +feedback into the learning process. This method's effectiveness is demonstrated +through improved task completion times and success rates. The framework +simulates realistic force-based interactions, enhancing the training data's +quality for precise robotic manipulation in construction tasks. + +
+
+ comment: 36 pages +
+
+
+
+
+ + ☆ Temporal Binding Foundation Model for Material Property Recognition via + Tactile Sequence Perception + + +
+ Robots engaged in complex manipulation tasks require robust material property +recognition to ensure adaptability and precision. Traditionally, visual data +has been the primary source for object perception; however, it often proves +insufficient in scenarios where visibility is obstructed or detailed +observation is needed. This gap highlights the necessity of tactile sensing as +a complementary or primary input for material recognition. Tactile data becomes +particularly essential in contact-rich, small-scale manipulations where subtle +deformations and surface interactions cannot be accurately captured by vision +alone. This letter presents a novel approach leveraging a temporal binding +foundation model for tactile sequence understanding to enhance material +property recognition. By processing tactile sensor data with a temporal focus, +the proposed system captures the sequential nature of tactile interactions, +similar to human fingertip perception. Additionally, this letter demonstrates +that, through tailored and specific design, the foundation model can more +effectively capture temporal information embedded in tactile sequences, +advancing material property understanding. Experimental results validate the +model's capability to capture these temporal patterns, confirming its utility +for material property recognition in visually restricted scenarios. This work +underscores the necessity of embedding advanced tactile data processing +frameworks within robotic systems to achieve truly embodied and responsive +manipulation capabilities. + +
+
+ comment: 4 pages, +
+
+
+
+
+ + ☆ Noise-conditioned Energy-based Annealed Rewards (NEAR): A Generative + Framework for Imitation Learning from Observation ICLR + + +
+ This paper introduces a new imitation learning framework based on +energy-based generative models capable of learning complex, physics-dependent, +robot motion policies through state-only expert motion trajectories. Our +algorithm, called Noise-conditioned Energy-based Annealed Rewards (NEAR), +constructs several perturbed versions of the expert's motion data distribution +and learns smooth, and well-defined representations of the data distribution's +energy function using denoising score matching. We propose to use these learnt +energy functions as reward functions to learn imitation policies via +reinforcement learning. We also present a strategy to gradually switch between +the learnt energy functions, ensuring that the learnt rewards are always +well-defined in the manifold of policy-generated samples. We evaluate our +algorithm on complex humanoid tasks such as locomotion and martial arts and +compare it with state-only adversarial imitation learning algorithms like +Adversarial Motion Priors (AMP). Our framework sidesteps the optimisation +challenges of adversarial imitation learning techniques and produces results +comparable to AMP in several quantitative metrics across multiple imitation +settings. + +
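+
+ A rough sketch of how a learned, noise-conditioned energy could be used as an annealed reward, assuming an energy_net(state, sigma) interface; the actual NEAR formulation and switching rule are defined in the paper:
+
+     import torch
+
+     def energy_reward(energy_net, state, sigma):
+         # Reward = negative learned energy at noise level sigma: states close to
+         # the (smoothed) expert data manifold receive higher reward.
+         with torch.no_grad():
+             return -energy_net(state, sigma)
+
+     # Illustrative annealing: start from a heavily smoothed energy (large sigma)
+     # and switch to sharper ones as the policy-generated samples improve.
+     sigma_schedule = [1.0, 0.5, 0.25, 0.1]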
+
+ comment: Accepted as a conference paper at the International Conference on + Learning Representations (ICLR) 2025 +
+
+
+
+
+ + ♻ ☆ From One to the Power of Many: Invariance to Multi-LiDAR Perception from + Single-Sensor Datasets + + +
+ Recently, LiDAR segmentation methods for autonomous vehicles, powered by deep +neural networks, have experienced steep growth in performance on classic +benchmarks, such as nuScenes and SemanticKITTI. However, there are still large +gaps in performance when deploying models trained on such single-sensor setups +to modern vehicles with multiple high-resolution LiDAR sensors. In this work, +we introduce a new metric for feature-level invariance which can serve as a +proxy to measure cross-domain generalization without requiring labeled data. +Additionally, we propose two application-specific data augmentations, which +facilitate better transfer to multi-sensor LiDAR setups, when trained on +single-sensor datasets. We provide experimental evidence on both simulated and +real data, that our proposed augmentations improve invariance across LiDAR +setups, leading to improved generalization. + +
+
+ comment: Accepted for publication at the ML4AD Workshop @ AAAI Conference 2025 +
+
+
+
+
+ + ♻ ☆ RoboHorizon: An LLM-Assisted Multi-View World Model for Long-Horizon + Robotic Manipulation + + +
+ Efficient control in long-horizon robotic manipulation is challenging due to +complex representation and policy learning requirements. Model-based visual +reinforcement learning (RL) has shown great potential in addressing these +challenges but still faces notable limitations, particularly in handling sparse +rewards and complex visual features in long-horizon environments. To address +these limitations, we propose the Recognize-Sense-Plan-Act (RSPA) pipeline for +long-horizon tasks and further introduce RoboHorizon, an LLM-assisted +multi-view world model tailored for long-horizon robotic manipulation. In +RoboHorizon, pre-trained LLMs generate dense reward structures for multi-stage +sub-tasks based on task language instructions, enabling robots to better +recognize long-horizon tasks. Keyframe discovery is then integrated into the +multi-view masked autoencoder (MAE) architecture to enhance the robot's ability +to sense critical task sequences, strengthening its multi-stage perception of +long-horizon processes. Leveraging these dense rewards and multi-view +representations, a robotic world model is constructed to efficiently plan +long-horizon tasks, enabling the robot to reliably act through RL algorithms. +Experiments on two representative benchmarks, RLBench and FurnitureBench, show +that RoboHorizon outperforms state-of-the-art visual model-based RL methods, +achieving a 23.35% improvement in task success rates on RLBench's 4 +short-horizon tasks and a 29.23% improvement on 6 long-horizon tasks from +RLBench and 3 furniture assembly tasks from FurnitureBench. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ S3PT: Scene Semantics and Structure Guided Clustering to Boost + Self-Supervised Pre-Training for Autonomous Driving + + +
+ Recent self-supervised clustering-based pre-training techniques like DINO and +Cribo have shown impressive results for downstream detection and segmentation +tasks. However, real-world applications such as autonomous driving face +challenges with imbalanced object class and size distributions and complex +scene geometries. In this paper, we propose S3PT a novel scene semantics and +structure guided clustering to provide more scene-consistent objectives for +self-supervised training. Specifically, our contributions are threefold: First, +we incorporate semantic distribution consistent clustering to encourage better +representation of rare classes such as motorcycles or animals. Second, we +introduce object diversity consistent spatial clustering, to handle imbalanced +and diverse object sizes, ranging from large background areas to small objects +such as pedestrians and traffic signs. Third, we propose a depth-guided spatial +clustering to regularize learning based on geometric information of the scene, +thus further refining region separation on the feature level. Our learned +representations significantly improve performance in downstream semantic +segmentation and 3D object detection tasks on the nuScenes, nuImages, and +Cityscapes datasets and show promising domain translation properties. + +
+
+ comment: Accepted for WACV 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ Deadlock-free, Safe, and Decentralized Multi-Robot Navigation in Social + Mini-Games via Discrete-Time Control Barrier Functions + + +
+ We present an approach to ensure safe and deadlock-free navigation for decentralized multi-robot systems operating in constrained environments, including doorways and intersections. Although many solutions have been proposed that ensure safety and resolve deadlocks, optimally preventing deadlocks in a minimally invasive and decentralized fashion remains an open problem. We first formalize the objective as a non-cooperative, non-communicative, partially observable multi-robot navigation problem in constrained spaces with multiple conflicting agents, which we term social mini-games. Formally, we solve a discrete-time optimal receding horizon control problem leveraging control barrier functions for safe long-horizon planning. Our approach to ensuring liveness rests on the insight that \textit{there exist barrier certificates that allow each robot to preemptively perturb its state, in a minimally invasive fashion, onto liveness sets, i.e., states where robots are deadlock-free}. We evaluate our approach in simulation as well as on physical robots using F$1/10$ robots, a Clearpath Jackal, and a Boston Dynamics Spot in doorway, hallway, and corridor intersection scenarios. Compared to both fully decentralized and centralized approaches with and without deadlock resolution capabilities, we demonstrate that our approach results in safer, more efficient, and smoother navigation, based on a comprehensive set of metrics including success rate, collision rate, stop time, change in velocity, path deviation, time-to-goal, and flow rate. +
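+
+ For reference, the standard discrete-time control barrier function condition that such a receding-horizon formulation typically enforces at every step can be written as a one-line check; the liveness sets and barrier certificates from the abstract are not reproduced here:
+
+     def dcbf_satisfied(h_curr, h_next, gamma=0.2):
+         # Discrete-time CBF condition: h(x_{k+1}) - h(x_k) >= -gamma * h(x_k),
+         # with 0 < gamma <= 1. Starting from h(x_0) >= 0, this keeps the state
+         # inside the safe set {x : h(x) >= 0}.
+         return h_next - h_curr >= -gamma * h_curr
+
+     def h_clearance(pos, obstacle, r_safe):
+         # Example barrier: squared distance to an obstacle minus a safety radius squared.
+         dx, dy = pos[0] - obstacle[0], pos[1] - obstacle[1]
+         return dx * dx + dy * dy - r_safe * r_safe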
+
+ comment: major update since last revision +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 102 + +
+
+
+ + ☆ HERMES: A Unified Self-Driving World Model for Simultaneous 3D Scene + Understanding and Generation + + +
+ Driving World Models (DWMs) have become essential for autonomous driving by +enabling future scene prediction. However, existing DWMs are limited to scene +generation and fail to incorporate scene understanding, which involves +interpreting and reasoning about the driving environment. In this paper, we +present a unified Driving World Model named HERMES. We seamlessly integrate 3D +scene understanding and future scene evolution (generation) through a unified +framework in driving scenarios. Specifically, HERMES leverages a Bird's-Eye +View (BEV) representation to consolidate multi-view spatial information while +preserving geometric relationships and interactions. We also introduce world +queries, which incorporate world knowledge into BEV features via causal +attention in the Large Language Model (LLM), enabling contextual enrichment for +understanding and generation tasks. We conduct comprehensive studies on +nuScenes and OmniDrive-nuScenes datasets to validate the effectiveness of our +method. HERMES achieves state-of-the-art performance, reducing generation error +by 32.4% and improving understanding metrics such as CIDEr by 8.0%. The model +and code will be publicly released at https://github.com/LMD0311/HERMES. + +
+
+ comment: Work in progress. The code will be available at + https://github.com/LMD0311/HERMES +
+
+
+
+
+ + ☆ Mitigating GenAI-powered Evidence Pollution for Out-of-Context + Multimodal Misinformation Detection + + +
+ While large generative artificial intelligence (GenAI) models have achieved +significant success, they also raise growing concerns about online information +security due to their potential misuse for generating deceptive content. +Out-of-context (OOC) multimodal misinformation detection, which often retrieves +Web evidence to identify the repurposing of images in false contexts, faces the +issue of reasoning over GenAI-polluted evidence to derive accurate predictions. +Existing works simulate GenAI-powered pollution at the claim level with +stylistic rewriting to conceal linguistic cues, and ignore evidence-level +pollution for such information-seeking applications. In this work, we +investigate how polluted evidence affects the performance of existing OOC +detectors, revealing a performance degradation of more than 9 percentage +points. We propose two strategies, cross-modal evidence reranking and +cross-modal claim-evidence reasoning, to address the challenges posed by +polluted evidence. Extensive experiments on two benchmark datasets show that +these strategies can effectively enhance the robustness of existing +out-of-context detectors amidst polluted evidence. + +
+
+ comment: 12 pages, 11 figures +
+
+
+
+
+ + ☆ Relightable Full-Body Gaussian Codec Avatars + + +
+ We propose Relightable Full-Body Gaussian Codec Avatars, a new approach for +modeling relightable full-body avatars with fine-grained details including face +and hands. The unique challenge for relighting full-body avatars lies in the +large deformations caused by body articulation and the resulting impact on +appearance caused by light transport. Changes in body pose can dramatically +change the orientation of body surfaces with respect to lights, resulting in +both local appearance changes due to changes in local light transport +functions, as well as non-local changes due to occlusion between body parts. To +address this, we decompose the light transport into local and non-local +effects. Local appearance changes are modeled using learnable zonal harmonics +for diffuse radiance transfer. Unlike spherical harmonics, zonal harmonics are +highly efficient to rotate under articulation. This allows us to learn diffuse +radiance transfer in a local coordinate frame, which disentangles the local +radiance transfer from the articulation of the body. To account for non-local +appearance changes, we introduce a shadow network that predicts shadows given +precomputed incoming irradiance on a base mesh. This facilitates the learning +of non-local shadowing between the body parts. Finally, we use a deferred +shading approach to model specular radiance transfer and better capture +reflections and highlights such as eye glints. We demonstrate that our approach +successfully models both the local and non-local light transport required for +relightable full-body avatars, with a superior generalization ability under +novel illumination conditions and unseen poses. + +
+
+ comment: 14 pages, 9 figures. Project page: + https://neuralbodies.github.io/RFGCA +
+
+
+
+
+ + ☆ Enhanced Confocal Laser Scanning Microscopy with Adaptive Physics + Informed Deep Autoencoders + + +
+ We present a physics-informed deep learning framework to address common limitations in Confocal Laser Scanning Microscopy (CLSM), such as diffraction-limited resolution, noise, and undersampling due to low laser power conditions. The optical system's point spread function (PSF) and common CLSM image degradation mechanisms, namely photon shot noise, dark current noise, motion blur, speckle noise, and undersampling, were modeled and directly incorporated into the model architecture. The model reconstructs high-fidelity images from heavily noisy inputs by using convolutional and transposed convolutional layers. Following the advances in compressed sensing, our approach significantly reduces data acquisition requirements without compromising image resolution. The proposed method was extensively evaluated on simulated CLSM images of diverse structures, including lipid droplets, neuronal networks, and fibrillar systems. Comparisons with traditional deconvolution algorithms such as Richardson-Lucy (RL) and non-negative least squares (NNLS), and with other methods like Total Variation (TV) regularization, Wiener filtering, and wavelet denoising, demonstrate the superiority of the network in restoring fine structural details with high fidelity. Assessment metrics such as the Structural Similarity Index (SSIM) and Peak Signal-to-Noise Ratio (PSNR) underline that the AdaptivePhysicsAutoencoder achieves robust image enhancement across diverse CLSM conditions, enabling faster acquisition, reduced photodamage, and reliable performance in low-light and sparse-sampling scenarios, holding promise for applications in live-cell imaging, dynamic biological studies, and high-throughput material characterization. +
+
+
+
+
+ + ☆ Stroke classification using Virtual Hybrid Edge Detection from in silico + electrical impedance tomography data + + +
+ Electrical impedance tomography (EIT) is a non-invasive imaging method for +recovering the internal conductivity of a physical body from electric boundary +measurements. EIT combined with machine learning has shown promise for the +classification of strokes. However, most previous works have used raw EIT +voltage data as network inputs. We build upon a recent development which +suggested the use of special noise-robust Virtual Hybrid Edge Detection (VHED) +functions as network inputs, although that work used only highly simplified and +mathematically ideal models. In this work we strengthen the case for the use of +EIT, and VHED functions especially, for stroke classification. We design models +with high detail and mathematical realism to test the use of VHED functions as +inputs. Virtual patients are created using a physically detailed 2D head model +which includes features known to create challenges in real-world imaging +scenarios. Conductivity values are drawn from statistically realistic +distributions, and phantoms are afflicted with either hemorrhagic or ischemic +strokes of various shapes and sizes. Simulated noisy EIT electrode data, +generated using the realistic Complete Electrode Model (CEM) as opposed to the +mathematically ideal continuum model, is processed to obtain VHED functions. We +compare the use of VHED functions as inputs against the alternative paradigm of +using raw EIT voltages. Our results show that (i) stroke classification can be +performed with high accuracy using 2D EIT data from physically detailed and +mathematically realistic models, and (ii) in the presence of noise, VHED +functions outperform raw data as network inputs. + +
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ Approach to Designing CV Systems for Medical Applications: Data, + Architecture and AI + + +
+ This paper introduces an innovative software system for fundus image analysis +that deliberately diverges from the conventional screening approach, opting not +to predict specific diagnoses. Instead, our methodology mimics the diagnostic +process by thoroughly analyzing both normal and pathological features of fundus +structures, leaving the ultimate decision-making authority in the hands of +healthcare professionals. Our initiative addresses the need for objective +clinical analysis and seeks to automate and enhance the clinical workflow of +fundus image examination. The system, from its overarching architecture to the +modular analysis design powered by artificial intelligence (AI) models, aligns +seamlessly with ophthalmological practices. Our unique approach utilizes a +combination of state-of-the-art deep learning methods and traditional computer +vision algorithms to provide a comprehensive and nuanced analysis of fundus +structures. We present a distinctive methodology for designing medical +applications, using our system as an illustrative example. Comprehensive +verification and validation results demonstrate the efficacy of our approach in +revolutionizing fundus image analysis, with potential applications across +various medical domains. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Rethinking Foundation Models for Medical Image Classification through a + Benchmark Study on MedMNIST + + +
+ Foundation models are widely employed in medical image analysis, due to their +high adaptability and generalizability for downstream tasks. With the +increasing number of foundation models being released, model selection has +become an important issue. In this work, we study the capabilities of +foundation models in medical image classification tasks by conducting a +benchmark study on the MedMNIST dataset. Specifically, we adopt various +foundation models ranging from convolutional to Transformer-based models and +implement both end-to-end training and linear probing for all classification +tasks. The results demonstrate the significant potential of these pre-trained +models when transferred for medical image classification. We further conduct +experiments with different image sizes and various sizes of training data. By +analyzing all the results, we provide preliminary, yet useful insights and +conclusions on this topic. + +
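+
+ Since the benchmark compares end-to-end training with linear probing, a minimal sketch of the latter may be useful; the backbone, feature dimension, and class count are placeholders for whichever foundation model is being evaluated:
+
+     import torch.nn as nn
+
+     def build_linear_probe(backbone, feat_dim, num_classes):
+         # Linear probing: freeze the pre-trained model and train only a linear
+         # classifier on top of its (batch, feat_dim) features.
+         for p in backbone.parameters():
+             p.requires_grad = False
+         return nn.Sequential(backbone, nn.Linear(feat_dim, num_classes))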
+
+ comment: submitted to MIDL2025 +
+
+
+
+
+ + ☆ Surface Vision Mamba: Leveraging Bidirectional State Space Model for + Efficient Spherical Manifold Representation + + +
+ Attention-based methods have demonstrated exceptional performance in +modelling long-range dependencies on spherical cortical surfaces, surpassing +traditional Geometric Deep Learning (GDL) models. However, their extensive +inference time and high memory demands pose challenges for application to large +datasets with limited computing resources. Inspired by the state space model in +computer vision, we introduce the attention-free Vision Mamba (Vim) to +spherical surfaces, presenting a domain-agnostic architecture for analyzing +data on spherical manifolds. Our method achieves surface patching by +representing spherical data as a sequence of triangular patches derived from a +subdivided icosphere. The proposed Surface Vision Mamba (SiM) is evaluated on +multiple neurodevelopmental phenotype regression tasks using cortical surface +metrics from neonatal brains. Experimental results demonstrate that SiM +outperforms both attention- and GDL-based methods, delivering 4.8 times faster +inference and achieving 91.7% lower memory consumption compared to the Surface +Vision Transformer (SiT) under the Ico-4 grid partitioning. Sensitivity +analysis further underscores the potential of SiM to identify subtle cognitive +developmental patterns. The code is available at +https://github.com/Rongzhao-He/surface-vision-mamba. + +
+
+
+
+
+ + ☆ MatAnyone: Stable Video Matting with Consistent Memory Propagation + + +
+ Auxiliary-free human video matting methods, which rely solely on input +frames, often struggle with complex or ambiguous backgrounds. To address this, +we propose MatAnyone, a robust framework tailored for target-assigned video +matting. Specifically, building on a memory-based paradigm, we introduce a +consistent memory propagation module via region-adaptive memory fusion, which +adaptively integrates memory from the previous frame. This ensures semantic +stability in core regions while preserving fine-grained details along object +boundaries. For robust training, we present a larger, high-quality, and diverse +dataset for video matting. Additionally, we incorporate a novel training +strategy that efficiently leverages large-scale segmentation data, boosting +matting stability. With this new network design, dataset, and training +strategy, MatAnyone delivers robust and accurate video matting results in +diverse real-world scenarios, outperforming existing methods. + +
+
+ comment: Project page: https://pq-yang.github.io/projects/MatAnyone +
+
+
+
+
+ + ☆ Towards Unified Structured Light Optimization + + +
+ Structured light (SL) 3D reconstruction captures the precise surface shape of +objects, providing high-accuracy 3D data essential for industrial inspection +and robotic vision systems. However, current research on optimizing projection +patterns in SL 3D reconstruction faces two main limitations: each scene +requires separate training of calibration parameters, and optimization is +restricted to specific types of SL, which restricts their application range. To +tackle these limitations, we present a unified framework for SL optimization, +adaptable to diverse lighting conditions, object types, and different types of +SL. Our framework quickly determines the optimal projection pattern using only +a single projected image. Key contributions include a novel global matching +method for projectors, enabling precise projector-camera alignment with just +one projected image, and a new projection compensation model with a photometric +adjustment module to reduce artifacts from out-of-gamut clipping. Experimental +results show our method achieves superior decoding accuracy across various +objects, SL patterns, and lighting conditions, significantly outperforming +previous methods. + +
+
+
+
+
+ + ☆ SyncAnimation: A Real-Time End-to-End Framework for Audio-Driven Human + Pose and Talking Head Animation + + +
+ Generating a talking avatar driven by audio remains a significant challenge. Existing methods typically require high computational costs and often lack sufficient facial detail and realism, making them unsuitable for applications that demand high real-time performance and visual quality. Additionally, while some methods can synchronize lip movement, they still face issues with consistency between facial expressions and upper body movement, particularly during silent periods. In this paper, we introduce SyncAnimation, the first NeRF-based method that achieves audio-driven, stable, and real-time generation of a speaking avatar by combining generalized audio-to-pose matching and audio-to-expression synchronization. By integrating AudioPose Syncer and AudioEmotion Syncer, SyncAnimation achieves high-precision pose and expression generation, progressively producing audio-synchronized upper body, head, and lip shapes. Furthermore, the High-Synchronization Human Renderer ensures seamless integration of the head and upper body, and achieves audio-synchronized lip motion. The project page can be found at https://syncanimation.github.io +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ ReferDINO: Referring Video Object Segmentation with Visual Grounding + Foundations + + +
+ Referring video object segmentation (RVOS) aims to segment target objects throughout a video based on a text description. Despite notable progress in recent years, current RVOS models still struggle to handle complicated object descriptions due to their limited video-language understanding. To address this limitation, we present \textbf{ReferDINO}, an end-to-end RVOS model that inherits strong vision-language understanding from pretrained visual grounding foundation models, and is further endowed with effective temporal understanding and object segmentation capabilities. In ReferDINO, we contribute three technical innovations for effectively adapting the foundation models to RVOS: 1) an object-consistent temporal enhancer that capitalizes on the pretrained object-text representations to enhance temporal understanding and object consistency; 2) a grounding-guided deformable mask decoder that integrates text and grounding conditions to generate accurate object masks; 3) a confidence-aware query pruning strategy that significantly improves object decoding efficiency without compromising performance. We conduct extensive experiments on five public RVOS benchmarks to demonstrate that our proposed ReferDINO outperforms state-of-the-art methods significantly. Project page: \url{https://isee-laboratory.github.io/ReferDINO} +
+
+ comment: Project page: https://isee-laboratory.github.io/ReferDINO +
+
+
+
+
+ + ☆ 3DLabelProp: Geometric-Driven Domain Generalization for LiDAR Semantic + Segmentation in Autonomous Driving + + +
+ Domain generalization aims to find ways for deep learning models to maintain +their performance despite significant domain shifts between training and +inference datasets. This is particularly important for models that need to be +robust or are costly to train. LiDAR perception in autonomous driving is +impacted by both of these concerns, leading to the emergence of various +approaches. This work addresses the challenge by proposing a geometry-based +approach, leveraging the sequential structure of LiDAR sensors, which sets it +apart from the learning-based methods commonly found in the literature. The +proposed method, called 3DLabelProp, is applied on the task of LiDAR Semantic +Segmentation (LSS). Through extensive experimentation on seven datasets, it is +demonstrated to be a state-of-the-art approach, outperforming both naive and +other domain generalization methods. + +
+
+
+
+
+ + ☆ Geometric Mean Improves Loss For Few-Shot Learning + + +
+ Few-shot learning (FSL) is a challenging task in machine learning, requiring a model to perform discriminative classification using only a few labeled samples. In the FSL literature, deep models are trained in a metric-learning manner to provide a metric in a feature space that generalizes well to classifying samples of novel classes; in this space, even a small number of labeled training examples can construct an effective classifier. In this paper, we propose a novel FSL loss based on the \emph{geometric mean} to embed a discriminative metric into deep features. In contrast to other losses, such as those utilizing the arithmetic mean in a softmax-based formulation, the proposed method leverages the geometric mean to aggregate pair-wise relationships among samples, enhancing the discriminative metric across class categories. The proposed loss not only takes a simple form but is also analyzed theoretically to reveal characteristics that are favorable for learning a feature metric in FSL. In experiments on few-shot image classification tasks, the method produces competitive performance in comparison to other losses. +
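+
+ The core idea, aggregating pair-wise terms with a geometric rather than an arithmetic mean, can be illustrated as follows; this is only a schematic of the aggregation step, not the paper's exact loss:
+
+     import torch
+
+     def arithmetic_mean(p):
+         return p.mean()
+
+     def geometric_mean(p, eps=1e-8):
+         # Geometric mean of positive pair-wise terms, computed in log space for
+         # numerical stability. A single small term drags the whole value down,
+         # making the aggregation more sensitive to hard pairs than the arithmetic mean.
+         return torch.exp(torch.log(p + eps).mean())
+
+     # e.g. with p holding per-pair probabilities of the correct class, a loss of
+     # the form -log(geometric_mean(p)) equals -mean(log(p + eps)).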
+
+
+
+
+ + ☆ Improved Vessel Segmentation with Symmetric Rotation-Equivariant U-Net + + +
+ Automated segmentation plays a pivotal role in medical image analysis and +computer-assisted interventions. Despite the promising performance of existing +methods based on convolutional neural networks (CNNs), they neglect useful +equivariant properties for images, such as rotational and reflection +equivariance. This limitation can decrease performance and lead to inconsistent +predictions, especially in applications like vessel segmentation where explicit +orientation is absent. While existing equivariant learning approaches attempt +to mitigate these issues, they substantially increase learning cost, model +size, or both. To overcome these challenges, we propose a novel application of +an efficient symmetric rotation-equivariant (SRE) convolutional (SRE-Conv) +kernel implementation to the U-Net architecture, to learn rotation and +reflection-equivariant features, while also reducing the model size +dramatically. We validate the effectiveness of our method through improved +segmentation performance on retina vessel fundus imaging. Our proposed SRE +U-Net not only significantly surpasses standard U-Net in handling rotated +images, but also outperforms existing equivariant learning methods and does so +with a reduced number of trainable parameters and smaller memory cost. The code +is available at https://github.com/OnofreyLab/sre_conv_segm_isbi2025. + +
+
+ comment: Accepted by IEEE ISBI 2025 +
+
+
+
+
+ + ☆ Visual Localization via Semantic Structures in Autonomous Photovoltaic + Power Plant Inspection + + +
+ Inspection systems utilizing unmanned aerial vehicles (UAVs) equipped with +thermal cameras are increasingly popular for the maintenance of photovoltaic +(PV) power plants. However, automation of the inspection task is a challenging +problem as it requires precise navigation to capture images from optimal +distances and viewing angles. + This paper presents a novel localization pipeline that directly integrates PV +module detection with UAV navigation, allowing precise positioning during +inspection. Detections are used to identify the power plant structures in the +image and associate these with the power plant model. We define visually +recognizable anchor points for the initial association and use object tracking +to discern global associations. We present three distinct methods for visual +segmentation of PV modules based on traditional computer vision, deep learning, +and their fusion, and we evaluate their performance in relation to the proposed +localization pipeline. + The presented methods were verified and evaluated using custom aerial +inspection data sets, demonstrating their robustness and applicability for +real-time navigation. Additionally, we evaluate the influence of the power +plant model's precision on the localization methods. + +
+
+ comment: 47 pages, 22 figures +
+
+
+
+
+ + ☆ Large-scale and Fine-grained Vision-language Pre-training for Enhanced + CT Image Understanding ICLR 2025 + + +
+ Artificial intelligence (AI) shows great potential in assisting radiologists +to improve the efficiency and accuracy of medical image interpretation and +diagnosis. However, a versatile AI model requires large-scale data and +comprehensive annotations, which are often impractical in medical settings. +Recent studies leverage radiology reports as a naturally high-quality +supervision for medical images, using contrastive language-image pre-training +(CLIP) to develop language-informed models for radiological image +interpretation. Nonetheless, these approaches typically contrast entire images +with reports, neglecting the local associations between imaging regions and +report sentences, which may undermine model performance and interoperability. +In this paper, we propose a fine-grained vision-language model (fVLM) for +anatomy-level CT image interpretation. Specifically, we explicitly match +anatomical regions of CT images with corresponding descriptions in radiology +reports and perform contrastive pre-training for each anatomy individually. +Fine-grained alignment, however, faces considerable false-negative challenges, +mainly from the abundance of anatomy-level healthy samples and similarly +diseased abnormalities. To tackle this issue, we propose identifying false +negatives of both normal and abnormal samples and calibrating contrastive +learning from patient-level to disease-aware pairing. We curated the largest CT +dataset to date, comprising imaging and report data from 69,086 patients, and +conducted a comprehensive evaluation of 54 major and important disease +diagnosis tasks across 15 main anatomies. Experimental results demonstrate the +substantial potential of fVLM in versatile medical image interpretation. In the +zero-shot classification task, we achieved an average AUC of 81.3% on 54 +diagnosis tasks, surpassing CLIP and supervised methods by 12.9% and 8.0%, +respectively. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ☆ Leveraging ChatGPT's Multimodal Vision Capabilities to Rank Satellite + Images by Poverty Level: Advancing Tools for Social Science Research + + +
+ This paper investigates the novel application of Large Language Models (LLMs) +with vision capabilities to analyze satellite imagery for village-level poverty +prediction. Although LLMs were originally designed for natural language +understanding, their adaptability to multimodal tasks, including geospatial +analysis, has opened new frontiers in data-driven research. By leveraging +advancements in vision-enabled LLMs, we assess their ability to provide +interpretable, scalable, and reliable insights into human poverty from +satellite images. Using a pairwise comparison approach, we demonstrate that +ChatGPT can rank satellite images based on poverty levels with accuracy +comparable to domain experts. These findings highlight both the promise and the +limitations of LLMs in socioeconomic research, providing a foundation for their +integration into poverty assessment workflows. This study contributes to the +ongoing exploration of unconventional data sources for welfare analysis and +opens pathways for cost-effective, large-scale poverty monitoring. + +
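+
+ A minimal sketch of turning pairwise judgments into a ranking; the compare() callback stands in for the vision-enabled LLM query, and the simple win-count aggregation is an assumption rather than necessarily the paper's exact procedure:
+
+     from collections import defaultdict
+     from itertools import combinations
+
+     def rank_by_pairwise_wins(items, compare):
+         # compare(a, b) returns whichever image is judged poorer; images are then
+         # ranked from most to least poor by their number of "poorer" verdicts.
+         wins = defaultdict(int)
+         for a, b in combinations(items, 2):
+             wins[compare(a, b)] += 1
+         return sorted(items, key=lambda x: wins[x], reverse=True)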
+
+
+
+
+ + ☆ Rethinking Encoder-Decoder Flow Through Shared Structures + + +
+ Dense prediction tasks have seen encoder architectures grow increasingly complex; decoders, however, have remained largely the same. They rely on individual blocks decoding intermediate feature maps sequentially. We introduce banks, shared structures that are used by each decoding block to provide additional context in the decoding process. Applied via resampling and feature fusion, these structures improve depth-estimation performance for state-of-the-art transformer-based architectures on natural and synthetic images when training on large-scale datasets. +
+
+
+
+
+ + ☆ Trick-GS: A Balanced Bag of Tricks for Efficient Gaussian Splatting + + +
+ Gaussian splatting (GS) for 3D reconstruction has become quite popular due to its fast training and inference speeds and high-quality reconstructions. However, GS-based reconstructions generally consist of millions of Gaussians, which makes them hard to use on computationally constrained devices such as smartphones. In this paper, we first propose a principled analysis of advances in efficient GS methods. Then, we propose Trick-GS, which is a careful combination of several strategies including (1) progressive training with resolution, noise and Gaussian scales, (2) learning to prune and mask primitives and SH bands by their significance, and (3) an accelerated GS training framework. Trick-GS takes a large step towards resource-constrained GS, where faster run-time, smaller models, and faster convergence are of paramount concern. Our results on three datasets show that Trick-GS achieves up to 2x faster training, 40x smaller disk size and 2x faster rendering speed compared to vanilla GS, while having comparable accuracy. +
+
+ comment: Accepted at ICASSP'25 +
+
+
+
+
+ + ☆ CheapNVS: Real-Time On-Device Narrow-Baseline Novel View Synthesis + + +
+ Single-view novel view synthesis (NVS) is a notorious problem due to its +ill-posed nature, and often requires large, computationally expensive +approaches to produce tangible results. In this paper, we propose CheapNVS: a +fully end-to-end approach for narrow baseline single-view NVS based on a novel, +efficient multiple encoder/decoder design trained in a multi-stage fashion. +CheapNVS first approximates the laborious 3D image warping with lightweight +learnable modules that are conditioned on the camera pose embeddings of the +target view, and then performs inpainting on the occluded regions in parallel +to achieve significant performance gains. Once trained on a subset of Open +Images dataset, CheapNVS outperforms the state-of-the-art despite being 10 +times faster and consuming 6% less memory. Furthermore, CheapNVS runs +comfortably in real-time on mobile devices, reaching over 30 FPS on a Samsung +Tab 9+. + +
+
+ comment: Accepted to ICASSP 2025 +
+
+
+
+
+ + ☆ Training-Free Style and Content Transfer by Leveraging U-Net Skip + Connections in Stable Diffusion 2.* + + +
+ Despite significant recent advances in image generation with diffusion models, their internal latent representations remain poorly understood. Existing works focus on the bottleneck layer (h-space) of Stable Diffusion's U-Net or leverage the cross-attention, self-attention, or decoding layers. Our model, SkipInject, takes advantage of U-Net's skip connections. We conduct thorough analyses on the role of the skip connections and find that the residual connections passed by the third encoder block carry most of the spatial information of the reconstructed image, splitting the content from the style. We show that injecting the representations from this block can be used for text-based editing, precise modifications, and style transfer. We compare our method with state-of-the-art style transfer and image editing methods and demonstrate that it obtains the best content alignment and optimal structural preservation tradeoff. +
+
+
+
+
+ + ☆ Scene Understanding Enabled Semantic Communication with Open Channel + Coding + + +
+ As communication systems transition from symbol transmission to conveying +meaningful information, sixth-generation (6G) networks emphasize semantic +communication. This approach prioritizes high-level semantic information, +improving robustness and reducing redundancy across modalities like text, +speech, and images. However, traditional semantic communication faces +limitations, including static coding strategies, poor generalization, and +reliance on task-specific knowledge bases that hinder adaptability. To overcome +these challenges, we propose a novel system combining scene understanding, +Large Language Models (LLMs), and open channel coding, named \textbf{OpenSC}. +Traditional systems rely on fixed domain-specific knowledge bases, limiting +their ability to generalize. Our open channel coding approach leverages shared, +publicly available knowledge, enabling flexible, adaptive encoding. This +dynamic system reduces reliance on static task-specific data, enhancing +adaptability across diverse tasks and environments. Additionally, we use scene +graphs for structured semantic encoding, capturing object relationships and +context to improve tasks like Visual Question Answering (VQA). Our approach +selectively encodes key semantic elements, minimizing redundancy and improving +transmission efficiency. Experimental results show significant improvements in +both semantic understanding and efficiency, advancing the potential of +adaptive, generalizable semantic communication in 6G networks. + +
+
+
+
+
+ + ☆ PARASIDE: An Automatic Paranasal Sinus Segmentation and Structure + Analysis Tool for MRI + + +
+ Chronic rhinosinusitis (CRS) is a common and persistent sinus inflammation that affects 5 - 12\% of the general population. It significantly impacts quality of life and is often difficult to assess due to its subjective nature in clinical evaluation. We introduce PARASIDE, an automatic tool for segmenting air and soft tissue volumes of the structures of the sinus maxillaris, frontalis, sphenoidalis and ethmoidalis in T1 MRI. By utilizing that segmentation, we can quantify feature relations that have previously been observed only manually and subjectively. We performed an exemplary study and showed both volume and intensity relations between structures and radiology reports. While the soft tissue segmentation is good, the automated annotations of the air volumes are excellent. The average intensities over air structures are consistently below those of the soft tissues, with close to perfect separability. Healthy subjects exhibit lower soft tissue volumes and lower intensities. Our system is the first to provide automated whole-nose segmentation of 16 structures, and it is capable of calculating medically relevant features such as the Lund-Mackay score. +
+
+
+
+
+ + ☆ Deep-BrownConrady: Prediction of Camera Calibration and Distortion + Parameters Using Deep Learning and Synthetic Data + + +
+ This research addresses the challenge of camera calibration and distortion +parameter prediction from a single image using deep learning models. The main +contributions of this work are: (1) demonstrating that a deep learning model, +trained on a mix of real and synthetic images, can accurately predict camera +and lens parameters from a single image, and (2) developing a comprehensive +synthetic dataset using the AILiveSim simulation platform. This dataset +includes variations in focal length and lens distortion parameters, providing a +robust foundation for model training and testing. The training process +predominantly relied on these synthetic images, complemented by a small subset +of real images, to explore how well models trained on synthetic data can +perform calibration tasks on real-world images. Traditional calibration methods +require multiple images of a calibration object from various orientations, +which is often not feasible due to the lack of such images in publicly +available datasets. A deep learning network based on the ResNet architecture +was trained on this synthetic dataset to predict camera calibration parameters +following the Brown-Conrady lens model. The ResNet architecture, adapted for +regression tasks, is capable of predicting continuous values essential for +accurate camera calibration in applications such as autonomous driving, +robotics, and augmented reality. + Keywords: Camera calibration, distortion, synthetic data, deep learning, +residual networks (ResNet), AILiveSim, horizontal field-of-view, principal +point, Brown-Conrady Model. + +
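+
+ For reference, the Brown-Conrady model that the regressed parameters describe maps ideal normalized image coordinates to distorted ones in the standard radial/tangential form below; intrinsics such as focal length and principal point are applied separately:
+
+     def brown_conrady_distort(x, y, k1, k2, k3, p1, p2):
+         # Radial terms k1..k3 and tangential terms p1, p2 applied to
+         # normalized image coordinates (x, y).
+         r2 = x * x + y * y
+         radial = 1.0 + k1 * r2 + k2 * r2 ** 2 + k3 * r2 ** 3
+         x_d = x * radial + 2.0 * p1 * x * y + p2 * (r2 + 2.0 * x * x)
+         y_d = y * radial + p1 * (r2 + 2.0 * y * y) + 2.0 * p2 * x * y
+         return x_d, y_d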
+
+
+
+
+ + ☆ LiDAR-Based Vehicle Detection and Tracking for Autonomous Racing + + +
+ Autonomous racing provides a controlled environment for testing the software and hardware of autonomous vehicles operating at their performance limits. Competitive interactions between multiple autonomous racecars, however, introduce challenging and potentially dangerous scenarios. Accurate and consistent vehicle detection and tracking are crucial for overtaking maneuvers, and low-latency sensor processing is essential to respond quickly to hazardous situations. This paper presents the LiDAR-based perception algorithms deployed on Team PoliMOVE's autonomous racecar, which won multiple competitions in the Indy Autonomous Challenge series. Our Vehicle Detection and Tracking pipeline is composed of a novel fast Point Cloud Segmentation technique and a specific Vehicle Pose Estimation methodology, together with a variable-step Multi-Target Tracking algorithm. Experimental results demonstrate the algorithm's performance, robustness, computational efficiency, and suitability for autonomous racing applications, enabling fully autonomous overtaking maneuvers at velocities exceeding 275 km/h.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ A Note on Implementation Errors in Recent Adaptive Attacks Against + Multi-Resolution Self-Ensembles + + +
+ This note documents an implementation issue in recent adaptive attacks (Zhang +et al. [2024]) against the multi-resolution self-ensemble defense (Fort and +Lakshminarayanan [2024]). The implementation allowed adversarial perturbations +to exceed the standard $L_\infty = 8/255$ bound by up to a factor of +20$\times$, reaching magnitudes of up to $L_\infty = 160/255$. When attacks are +properly constrained within the intended bounds, the defense maintains +non-trivial robustness. Beyond highlighting the importance of careful +validation in adversarial machine learning research, our analysis reveals an +intriguing finding: properly bounded adaptive attacks against strong +multi-resolution self-ensembles often align with human perception, suggesting +the need to reconsider how we measure adversarial robustness. + +
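As a concrete illustration of the constraint at issue, the sketch below projects an adversarial example back into the intended $L_\infty = 8/255$ ball around the clean input using PyTorch; it is a generic projection step, not the authors' evaluation code.

```python
import torch

def project_linf(x_adv, x_clean, eps=8 / 255):
    """Clamp the perturbation to the intended L-infinity budget and keep pixels in [0, 1]."""
    delta = torch.clamp(x_adv - x_clean, min=-eps, max=eps)
    return torch.clamp(x_clean + delta, min=0.0, max=1.0)

x = torch.rand(1, 3, 32, 32)                 # clean image in [0, 1]
x_adv = x + 0.5 * torch.randn_like(x)        # an over-sized perturbation (far beyond 8/255)
x_proj = project_linf(x_adv, x)
assert (x_proj - x).abs().max() <= 8 / 255 + 1e-6
```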
+
+ comment: 4 pages, 2 figures, technical note addressing an issue in + arXiv:2411.14834v1 +
+
+
+
+
+ + ☆ BILLNET: A Binarized Conv3D-LSTM Network with Logic-gated residual + architecture for hardware-efficient video inference + + +
+ Long Short-Term Memory (LSTM) and 3D convolution (Conv3D) show impressive results for many video-based applications but require large memory and intensive computing. Motivated by recent works on hardware-algorithmic co-design towards efficient inference, we propose a compact binarized Conv3D-LSTM model architecture called BILLNET, compatible with highly resource-constrained hardware. Firstly, BILLNET factorizes the costly standard Conv3D into two pointwise convolutions with a grouped convolution in between. Secondly, BILLNET enables binarized weights and activations via a MUX-OR-gated residual architecture. Finally, to efficiently train BILLNET, we propose a multi-stage training strategy that enables full quantization of the LSTM layers. Results on the Jester dataset show that our method can obtain high accuracy with extremely low memory and computational budgets compared to existing Conv3D resource-efficient models.
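The factorization described above (pointwise, then grouped spatial, then pointwise convolution) can be sketched in PyTorch as follows; the channel counts and group number are placeholders, and the binarization and MUX-OR gating are intentionally omitted.

```python
import torch
import torch.nn as nn

class FactorizedConv3D(nn.Module):
    """Dense k*k*k Conv3D replaced by pointwise -> grouped spatial -> pointwise convolutions."""
    def __init__(self, in_ch, out_ch, mid_ch=32, groups=4, kernel_size=3):
        super().__init__()
        self.reduce = nn.Conv3d(in_ch, mid_ch, kernel_size=1)
        self.spatial = nn.Conv3d(mid_ch, mid_ch, kernel_size=kernel_size,
                                 padding=kernel_size // 2, groups=groups)
        self.expand = nn.Conv3d(mid_ch, out_ch, kernel_size=1)

    def forward(self, x):
        return self.expand(self.spatial(self.reduce(x)))

x = torch.randn(1, 16, 8, 32, 32)            # (batch, channels, frames, height, width)
y = FactorizedConv3D(16, 64)(x)              # -> (1, 64, 8, 32, 32)
```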
+
+ comment: Published at IEEE SiPS 2022 +
+
+
+
+
+ + ☆ Registration of Longitudinal Liver Examinations for Tumor Progress + Assessment + + +
+ Assessing cancer progression in liver CT scans is a clinical challenge, requiring a comparison of scans at different times for the same patient. Practitioners must identify existing tumors, compare them with prior exams, identify new tumors, and evaluate overall disease evolution. This process is particularly complex in liver examinations due to misalignment between exams caused by several factors. Indeed, longitudinal liver examinations can undergo different non-pathological and pathological changes due to non-rigid deformations, the appearance or disappearance of pathologies, and other variations. In such cases, existing registration approaches, mainly based on intrinsic features, may distort tumor regions, biasing the tumor progress evaluation step and the corresponding diagnosis. This work proposes a registration method based only on geometrical and anatomical information from liver segmentation, aimed at aligning longitudinal liver images for aided diagnosis. The proposed method is trained and tested on longitudinal liver CT scans, with 317 patients for training and 53 for testing. Our experimental results show that our method outperforms other registration techniques, providing a smoother deformation while preserving the tumor burden (the total volume of tissues considered as tumor) within the volume. Qualitative results emphasize the importance of smooth deformations in preserving tumor appearance.
+
+
+
+
+ + ☆ Triple Path Enhanced Neural Architecture Search for Multimodal Fake News + Detection + + +
+ Multimodal fake news detection has become one of the most crucial issues on social media platforms. Although existing methods have achieved advanced performance, two main challenges persist: (1) underperforming multimodal news information fusion caused by rigid model architectures, and (2) weak generalization ability on fake news that contains only partial modalities. To meet these challenges, we propose MUSE, a novel and flexible triple-path enhanced neural architecture search model. MUSE includes two dynamic paths for detecting fake news with partial modalities and a static path for exploiting potential multimodal correlations. Experimental results show that MUSE achieves stable performance improvements over the baselines.
+
+ comment: This paper has been accepted into the IEEE International Conference + on Acoustics, Speech, and Signal Processing(ICASSP 2024) +
+
+
+
+
+ + ☆ Optimizing Human Pose Estimation Through Focused Human and Joint Regions + + +
+ Human pose estimation has given rise to a broad spectrum of novel and compelling applications, including action recognition, sports analysis, and surveillance. However, accurate video pose estimation remains an open challenge. One aspect that has been overlooked so far is that existing methods learn motion clues from all pixels rather than focusing on the target human body, making them easily misled and disrupted by unimportant information such as background changes or movements of other people. Additionally, while current Transformer-based pose estimation methods have demonstrated impressive performance with global modeling, they struggle with local context perception and precise positional identification. In this paper, we tackle these challenges from three aspects: (1) We propose a bilayer Human-Keypoint Mask module that performs coarse-to-fine visual token refinement, which gradually zooms in on the target human body and keypoints while masking out unimportant figure regions. (2) We further introduce a novel deformable cross attention mechanism and a bidirectional separation strategy to adaptively aggregate spatial and temporal motion clues from constrained surrounding contexts. (3) We mathematically formulate the deformable cross attention, constraining the model to focus solely on the regions centered at the target person's body. Empirically, our method achieves state-of-the-art performance on three large-scale benchmark datasets. A remarkable highlight is that our method achieves 84.8 mean Average Precision (mAP) on the challenging wrist joint, which significantly outperforms the 81.5 mAP achieved by the current state-of-the-art method on the PoseTrack2017 dataset.
+
+
+
+
+ + ☆ Context-CrackNet: A Context-Aware Framework for Precise Segmentation of + Tiny Cracks in Pavement images + + +
+ The accurate detection and segmentation of pavement distresses, particularly +tiny and small cracks, are critical for early intervention and preventive +maintenance in transportation infrastructure. Traditional manual inspection +methods are labor-intensive and inconsistent, while existing deep learning +models struggle with fine-grained segmentation and computational efficiency. To +address these challenges, this study proposes Context-CrackNet, a novel +encoder-decoder architecture featuring the Region-Focused Enhancement Module +(RFEM) and Context-Aware Global Module (CAGM). These innovations enhance the +model's ability to capture fine-grained local details and global contextual +dependencies, respectively. Context-CrackNet was rigorously evaluated on ten +publicly available crack segmentation datasets, covering diverse pavement +distress scenarios. The model consistently outperformed 9 state-of-the-art +segmentation frameworks, achieving superior performance metrics such as mIoU +and Dice score, while maintaining competitive inference efficiency. Ablation +studies confirmed the complementary roles of RFEM and CAGM, with notable +improvements in mIoU and Dice score when both modules were integrated. +Additionally, the model's balance of precision and computational efficiency +highlights its potential for real-time deployment in large-scale pavement +monitoring systems. + +
+
+
+
+
+ + ☆ Kolmogorov Arnold Neural Interpolator for Downscaling and Correcting + Meteorological Fields from In-Situ Observations + + +
+ Obtaining accurate weather forecasts at station locations is a critical challenge due to systematic biases arising from the mismatch between multi-scale, continuous atmospheric characteristics and their discrete, gridded representations. Previous works have primarily focused on modeling gridded meteorological data, inherently neglecting the off-grid, continuous nature of atmospheric states and leaving such biases unresolved. To address this, we propose the Kolmogorov Arnold Neural Interpolator (KANI), a novel framework that redefines meteorological field representation as continuous neural functions derived from discretized grids. Grounded in the Kolmogorov Arnold theorem, KANI captures the inherent continuity of atmospheric states and leverages sparse in-situ observations to correct these biases systematically. Furthermore, KANI introduces an innovative zero-shot downscaling capability, guided by high-resolution topographic textures without requiring high-resolution meteorological fields for supervision. Experimental results across three sub-regions of the continental United States indicate that KANI achieves an accuracy improvement of 40.28% for temperature and 67.41% for wind speed, highlighting its significant improvement over traditional interpolation methods. This enables a continuous neural representation of meteorological variables, transcending the limitations of conventional grid-based representations.
+
+
+
+
+ + ☆ CVOCSemRPL: Class-Variance Optimized Clustering, Semantic Information + Injection and Restricted Pseudo Labeling based Improved Semi-Supervised + Few-Shot Learning + + +
+ Few-shot learning has been extensively explored to address problems where the amount of labeled samples is very limited for some classes. In the semi-supervised few-shot learning setting, substantial quantities of unlabeled samples are available. Such unlabeled samples are generally cheaper to obtain and can be used to improve the few-shot learning performance of the model. Some of the recent methods for this setting rely on clustering to generate pseudo-labels for the unlabeled samples. Since the quality of the representation learned by the model heavily influences the effectiveness of clustering, this can also lead to incorrect labeling of the unlabeled samples and, consequently, a drop in few-shot learning performance. We propose an approach for semi-supervised few-shot learning that performs class-variance optimized clustering in order to improve the effectiveness of clustering the labeled and unlabeled samples in this setting. It also optimizes the clustering-based pseudo-labeling process using a restricted pseudo-labeling approach and performs semantic information injection in order to improve the semi-supervised few-shot learning performance of the model. We experimentally demonstrate that our proposed approach significantly outperforms recent state-of-the-art methods on the benchmark datasets.
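As a rough illustration of restricted pseudo-labeling (not the paper's class-variance optimized procedure), the sketch below clusters unlabeled embeddings with scikit-learn and keeps pseudo-labels only for the samples closest to their cluster centroid; the keep fraction and sizes are assumptions.

```python
import numpy as np
from sklearn.cluster import KMeans

def restricted_pseudo_labels(unlabeled_feats, n_classes, keep_fraction=0.5):
    """Cluster unlabeled embeddings; retain pseudo-labels only for samples near their centroid."""
    km = KMeans(n_clusters=n_classes, n_init=10, random_state=0).fit(unlabeled_feats)
    dists = np.linalg.norm(unlabeled_feats - km.cluster_centers_[km.labels_], axis=1)
    keep = dists <= np.quantile(dists, keep_fraction)
    return km.labels_[keep], np.where(keep)[0]

feats = np.random.randn(200, 64)             # toy embeddings of unlabeled samples
pseudo, kept_idx = restricted_pseudo_labels(feats, n_classes=5)
```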
+
+
+
+
+ + ☆ ECTIL: Label-efficient Computational Tumour Infiltrating Lymphocyte + (TIL) assessment in breast cancer: Multicentre validation in 2,340 patients + with breast cancer + + +
+ The level of tumour-infiltrating lymphocytes (TILs) is a prognostic factor for patients with (triple-negative) breast cancer (BC). Computational TIL assessment (CTA) has the potential to assist pathologists in this labour-intensive task, but current CTA models rely heavily on many detailed annotations. We propose and validate a fundamentally simpler deep learning-based CTA that can be trained in only ten minutes on hundredfold fewer pathologist annotations. We collected whole slide images (WSIs) with TILs scores and clinical data of 2,340 patients with BC from six cohorts, including three randomised clinical trials. Morphological features were extracted from the WSIs using a pathology foundation model. Our label-efficient Computational stromal TIL assessment model (ECTIL) directly regresses the TILs score from these features. ECTIL trained on only a few hundred samples (ECTIL-TCGA) showed concordance with the pathologist over five heterogeneous external cohorts (r=0.54-0.74, AUROC=0.80-0.94). Training on all slides of five cohorts (ECTIL-combined) improved results on a held-out test set (r=0.69, AUROC=0.85). Multivariable Cox regression analyses indicated that every 10% increase of ECTIL scores was associated with improved overall survival independent of clinicopathological variables (HR 0.86, p<0.01), similar to the pathologist score (HR 0.87, p<0.001). We demonstrate that ECTIL is highly concordant with an expert pathologist and obtains a similar hazard ratio. ECTIL has a fundamentally simpler design than existing methods and can be trained on orders of magnitude fewer annotations. Such a CTA may be used to pre-screen patients for, e.g., immunotherapy clinical trial inclusion, or as a tool to assist clinicians in the diagnostic work-up of patients with BC. Our model is available under an open source licence (https://github.com/nki-ai/ectil).
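To make the label-efficient setup concrete, here is a minimal PyTorch sketch that mean-pools tile-level foundation-model features from one slide and regresses a single TILs percentage; the feature dimension, pooling choice, and head architecture are assumptions, not the released ECTIL model.

```python
import torch
import torch.nn as nn

class TILRegressor(nn.Module):
    """Regress one TILs score (0-100) from pooled tile-level features of a single WSI."""
    def __init__(self, feat_dim=768):
        super().__init__()
        self.head = nn.Sequential(nn.Linear(feat_dim, 128), nn.ReLU(),
                                  nn.Linear(128, 1), nn.Sigmoid())

    def forward(self, tile_feats):               # tile_feats: (num_tiles, feat_dim)
        slide_feat = tile_feats.mean(dim=0)      # simple mean pooling over tiles
        return 100.0 * self.head(slide_feat)     # predicted TILs percentage

tile_feats = torch.randn(500, 768)               # stand-in for foundation-model tile features
pred_tils = TILRegressor()(tile_feats)
```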
+
+ comment: Under review. 54 pages including supplementary materials, 2 main + tables, 3 main figures, 14 supplementary figures, 4 supplementary tables +
+
+
+
+
+ + ☆ Low-rank Prompt Interaction for Continual Vision-Language Retrieval + + +
+ Research on continual learning in multi-modal tasks has been receiving +increasing attention. However, most existing work overlooks the explicit +cross-modal and cross-task interactions. In this paper, we innovatively propose +the Low-rank Prompt Interaction (LPI) to address this general problem of +multi-modal understanding, which considers both cross-modal and cross-task +interactions. Specifically, as for the former, we employ multi-modal +correlation modules for corresponding Transformer layers. Considering that the +training parameters scale to the number of layers and tasks, we propose +low-rank interaction-augmented decomposition to avoid memory explosion while +enhancing the cross-modal association through sharing and separating +common-specific low-rank factors. In addition, due to the multi-modal semantic +differences carried by the low-rank initialization, we adopt hierarchical +low-rank contrastive learning to ensure training robustness. As for the latter, +we initially employ a visual analysis and identify that different tasks have +clear distinctions in proximity. Therefore, we introduce explicit task +contrastive constraints in the prompt learning process based on task semantic +distances. Experiments on two retrieval tasks show performance improvements +with the introduction of a minimal number of parameters, demonstrating the +effectiveness of our method. Code is available at +https://github.com/Kelvin-ywc/LPI. + +
+
+
+
+
+ + ☆ Causal-Inspired Multitask Learning for Video-Based Human Pose Estimation + + +
+ Video-based human pose estimation has long been a fundamental yet challenging problem in computer vision. Previous studies focus on spatio-temporal modeling through the enhancement of architecture design and optimization strategies. However, they overlook the causal relationships among the joints, leading to models that may be overly tailored and thus generalize poorly to challenging scenes. Therefore, adequate causal reasoning capability, coupled with good interpretability of the model, is both indispensable and prerequisite for achieving reliable results. In this paper, we pioneer a causal perspective on pose estimation and introduce a causal-inspired multitask learning framework, consisting of two stages. \textit{In the first stage}, we endow the model with causal spatio-temporal modeling ability by introducing two self-supervision auxiliary tasks. Specifically, these auxiliary tasks enable the network to infer challenging keypoints based on observed keypoint information, thereby imbuing causal reasoning capabilities into the model and making it robust to challenging scenes. \textit{In the second stage}, we argue that not all feature tokens contribute equally to pose estimation. Prioritizing causal (keypoint-relevant) tokens is crucial to achieving reliable results and improves the interpretability of the model. To this end, we propose a Token Causal Importance Selection module to identify the causal tokens and non-causal tokens (\textit{e.g.}, background and objects). Additionally, non-causal tokens could provide potentially beneficial cues but may be redundant. We further introduce a non-causal tokens clustering module to merge similar non-causal tokens. Extensive experiments show that our method outperforms state-of-the-art methods on three large-scale benchmark datasets.
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Correlation-Based Band Selection for Hyperspectral Image Classification + + +
+ Hyperspectral images offer extensive spectral information about ground +objects across multiple spectral bands. However, the large volume of data can +pose challenges during processing. Typically, adjacent bands in hyperspectral +data are highly correlated, leading to the use of only a few selected bands for +various applications. In this work, we present a correlation-based band +selection approach for hyperspectral image classification. Our approach +calculates the average correlation between bands using correlation coefficients +to identify the relationships among different bands. Afterward, we select a +subset of bands by analyzing the average correlation and applying a +threshold-based method. This allows us to isolate and retain bands that exhibit +lower inter-band dependencies, ensuring that the selected bands provide diverse +and non-redundant information. We evaluate our proposed approach on two +standard benchmark datasets: Pavia University (PA) and Salinas Valley (SA), +focusing on image classification tasks. The experimental results demonstrate +that our method performs competitively with other standard band selection +approaches. + +
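A small NumPy sketch of the core idea: compute the band-to-band correlation matrix, average each band's correlation with the others, and keep bands below a threshold. The threshold value and the toy cube are illustrative.

```python
import numpy as np

def select_bands(cube, threshold=0.95):
    """Keep hyperspectral bands whose average absolute correlation with other bands is low."""
    h, w, b = cube.shape
    flat = cube.reshape(-1, b).T                    # (bands, pixels): rows are variables
    corr = np.abs(np.corrcoef(flat))                # (bands, bands) correlation matrix
    avg_corr = (corr.sum(axis=1) - 1.0) / (b - 1)   # exclude each band's self-correlation
    return np.where(avg_corr < threshold)[0]

cube = np.random.rand(64, 64, 30)                   # toy hyperspectral cube (H, W, bands)
selected = select_bands(cube, threshold=0.9)
print(f"{len(selected)} of {cube.shape[-1]} bands retained")
```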
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ Automatic detection and prediction of nAMD activity change in retinal + OCT using Siamese networks and Wasserstein Distance for ordinality + + +
+ Neovascular age-related macular degeneration (nAMD) is a leading cause of +vision loss among older adults, where disease activity detection and +progression prediction are critical for nAMD management in terms of timely drug +administration and improving patient outcomes. Recent advancements in deep +learning offer a promising solution for predicting changes in AMD from optical +coherence tomography (OCT) retinal volumes. In this work, we proposed deep +learning models for the two tasks of the public MARIO Challenge at MICCAI 2024, +designed to detect and forecast changes in nAMD severity with longitudinal +retinal OCT. For the first task, we employ a Vision Transformer (ViT) based +Siamese Network to detect changes in AMD severity by comparing scan embeddings +of a patient from different time points. To train a model to forecast the +change after 3 months, we exploit, for the first time, an Earth Mover +(Wasserstein) Distance-based loss to harness the ordinal relation within the +severity change classes. Both models ranked high on the preliminary +leaderboard, demonstrating that their predictive capabilities could facilitate +nAMD treatment management. + +
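The ordinal (Earth Mover / Wasserstein) loss mentioned above can be written, in its standard 1-D form, as a distance between cumulative distributions; the class count and batch below are placeholders, not the challenge configuration.

```python
import torch
import torch.nn.functional as F

def emd_loss(logits, target, num_classes, p=2):
    """1-D Earth Mover's distance between predicted class probabilities and a one-hot target."""
    probs = F.softmax(logits, dim=1)
    onehot = F.one_hot(target, num_classes).float()
    cdf_pred = torch.cumsum(probs, dim=1)
    cdf_true = torch.cumsum(onehot, dim=1)
    return ((cdf_pred - cdf_true).abs() ** p).sum(dim=1).mean()

logits = torch.randn(8, 3, requires_grad=True)   # e.g. 3 ordered severity-change classes (assumed)
labels = torch.randint(0, 3, (8,))
emd_loss(logits, labels, num_classes=3).backward()
```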
+
comment: Solution to the MICCAI 2024 MARIO Challenge. First 3 authors contributed equally. Models can be found at https://github.com/EmreTaha/Siamese-EMD-for-AMD-Change
+
+
+
+
+ + ☆ Scalable Benchmarking and Robust Learning for Noise-Free Ego-Motion and + 3D Reconstruction from Noisy Video ICLR 2025 + + +
+ We aim to redefine robust ego-motion estimation and photorealistic 3D +reconstruction by addressing a critical limitation: the reliance on noise-free +data in existing models. While such sanitized conditions simplify evaluation, +they fail to capture the unpredictable, noisy complexities of real-world +environments. Dynamic motion, sensor imperfections, and synchronization +perturbations lead to sharp performance declines when these models are deployed +in practice, revealing an urgent need for frameworks that embrace and excel +under real-world noise. To bridge this gap, we tackle three core challenges: +scalable data generation, comprehensive benchmarking, and model robustness +enhancement. First, we introduce a scalable noisy data synthesis pipeline that +generates diverse datasets simulating complex motion, sensor imperfections, and +synchronization errors. Second, we leverage this pipeline to create +Robust-Ego3D, a benchmark rigorously designed to expose noise-induced +performance degradation, highlighting the limitations of current learning-based +methods in ego-motion accuracy and 3D reconstruction quality. Third, we propose +Correspondence-guided Gaussian Splatting (CorrGS), a novel test-time adaptation +method that progressively refines an internal clean 3D representation by +aligning noisy observations with rendered RGB-D frames from clean 3D map, +enhancing geometric alignment and appearance restoration through visual +correspondence. Extensive experiments on synthetic and real-world data +demonstrate that CorrGS consistently outperforms prior state-of-the-art +methods, particularly in scenarios involving rapid motion and dynamic +illumination. + +
+
+ comment: Accepted by ICLR 2025; 92 Pages; Project Repo: + https://github.com/Xiaohao-Xu/SLAM-under-Perturbation. arXiv admin note: + substantial text overlap with arXiv:2406.16850 +
+
+
+
+
+ + ☆ Nautilus: Locality-aware Autoencoder for Scalable Mesh Generation + + +
+ Triangle meshes are fundamental to 3D applications, enabling efficient modification and rasterization while maintaining compatibility with standard rendering pipelines. However, current automatic mesh generation methods typically rely on intermediate representations that lack the continuous surface quality inherent to meshes. Converting these representations into meshes produces dense, suboptimal outputs. Although recent autoregressive approaches demonstrate promise in directly modeling mesh vertices and faces, they are constrained by limitations in face count, scalability, and structural fidelity. To address these challenges, we propose Nautilus, a locality-aware autoencoder for artist-like mesh generation that leverages the local properties of manifold meshes to achieve structural fidelity and efficient representation. Our approach introduces a novel tokenization algorithm that preserves face proximity relationships and compresses sequence length through locally shared vertices and edges, enabling the generation of meshes with an unprecedented scale of up to 5,000 faces. Furthermore, we develop a Dual-stream Point Conditioner that provides multi-scale geometric guidance, ensuring global consistency and local structural fidelity by capturing fine-grained geometric features. Extensive experiments demonstrate that Nautilus significantly outperforms state-of-the-art methods in both fidelity and scalability.
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ PAID: A Framework of Product-Centric Advertising Image Design + + +
+ In E-commerce platforms, a full advertising image is composed of a background +image and marketing taglines. Automatic ad image design reduces human costs and +plays a crucial role. For the convenience of users, a novel automatic framework +named Product-Centric Advertising Image Design (PAID) is proposed in this work. +PAID takes the product foreground image, required taglines, and target size as +input and creates an ad image automatically. PAID consists of four sequential +stages: prompt generation, layout generation, background image generation, and +graphics rendering. Different expert models are trained to conduct these +sub-tasks. A visual language model (VLM) based prompt generation model is +leveraged to produce a product-matching background prompt. The layout +generation model jointly predicts text and image layout according to the +background prompt, product, and taglines to achieve the best harmony. An +SDXL-based layout-controlled inpainting model is trained to generate an +aesthetic background image. Previous ad image design methods take a background +image as input and then predict the layout of taglines, which limits the +spatial layout due to fixed image content. Innovatively, our PAID adjusts the +stages to produce an unrestricted layout. To complete the PAID framework, we +created two high-quality datasets, PITA and PIL. Extensive experimental results +show that PAID creates more visually pleasing advertising images than previous +methods. + +
+
+
+
+
+ + ☆ BrainGuard: Privacy-Preserving Multisubject Image Reconstructions from + Brain Activities + + +
+ Reconstructing perceived images from human brain activity forms a crucial link between human and machine learning through Brain-Computer Interfaces. Early methods primarily focused on training separate models for each individual to account for individual variability in brain activity, overlooking valuable cross-subject commonalities. Recent advancements have explored multisubject methods, but these approaches face significant challenges, particularly in data privacy and effectively managing individual variability. To overcome these challenges, we introduce BrainGuard, a privacy-preserving collaborative training framework designed to enhance image reconstruction from multisubject fMRI data while safeguarding individual privacy. BrainGuard employs a collaborative global-local architecture where individual models are trained on each subject's local data and operate in conjunction with a shared global model that captures and leverages cross-subject patterns. This architecture eliminates the need to aggregate fMRI data across subjects, thereby ensuring privacy preservation. To tackle the complexity of fMRI data, BrainGuard integrates a hybrid synchronization strategy, enabling individual models to dynamically incorporate parameters from the global model. By establishing a secure and collaborative training environment, BrainGuard not only protects sensitive brain data but also improves image reconstruction accuracy. Extensive experiments demonstrate that BrainGuard sets a new benchmark in both high-level and low-level metrics, advancing the state-of-the-art in brain decoding through its innovative design.
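The global-local idea resembles federated averaging: subject-specific models are aggregated into a shared global model and the global parameters are then partially re-injected into each local model, so raw fMRI never leaves a subject's silo. The sketch below is a generic FedAvg-style illustration with an assumed mixing rule, not BrainGuard's actual hybrid synchronization.

```python
import torch

def aggregate_global(global_model, local_models):
    """Average subject-specific parameters into the shared global model (FedAvg-style)."""
    with torch.no_grad():
        for i, p_glob in enumerate(global_model.parameters()):
            p_glob.copy_(torch.stack([list(m.parameters())[i] for m in local_models]).mean(dim=0))

def sync_local(local_model, global_model, mix=0.5):
    """Blend each local parameter with its global counterpart (mixing coefficient assumed)."""
    with torch.no_grad():
        for p_loc, p_glob in zip(local_model.parameters(), global_model.parameters()):
            p_loc.mul_(1.0 - mix).add_(mix * p_glob)
```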
+
+ comment: AAAI 2025 oral +
+
+
+
+
+ + ☆ Learning Primitive Relations for Compositional Zero-Shot Learning + + +
+ Compositional Zero-Shot Learning (CZSL) aims to identify unseen state-object +compositions by leveraging knowledge learned from seen compositions. Existing +approaches often independently predict states and objects, overlooking their +relationships. In this paper, we propose a novel framework, learning primitive +relations (LPR), designed to probabilistically capture the relationships +between states and objects. By employing the cross-attention mechanism, LPR +considers the dependencies between states and objects, enabling the model to +infer the likelihood of unseen compositions. Experimental results demonstrate +that LPR outperforms state-of-the-art methods on all three CZSL benchmark +datasets in both closed-world and open-world settings. Through qualitative +analysis, we show that LPR leverages state-object relationships for unseen +composition prediction. + +
+
+ comment: Accepted to ICASSP 2025 +
+
+
+
+
+ + ☆ Additive Manufacturing Processes Protocol Prediction by Artificial + Intelligence using X-ray Computed Tomography data + + +
+ The quality of a part fabricated by an Additive Manufacturing (AM) process depends upon the process parameters used, and therefore optimization is required for adequate quality. A methodology is proposed to set these parameters non-iteratively without human intervention. It utilizes Artificial Intelligence (AI) to fully automate the process, with the capability to self-train any suitable AI model by further assimilating the training data. This study includes three commercially available 3D printers for soft material printing based on the Material Extrusion (MEX) AM process. The samples are 3D printed for six different AM process parameter settings obtained by varying layer height and nozzle speed. The novel part of the methodology is the incorporation of an AI-based image segmentation step in the decision-making stage that uses quality-inspected training data from the Non-Destructive Testing (NDT) method. The performance of the trained AI model is compared with two software tools based on the classical thresholding method. The AI-based Artificial Neural Network (ANN) model is trained on NDT-assessed and AI-segmented data to automate the selection of optimized process parameters. The AI-based model is 99.3% accurate, while the best available commercial classical image method is 83.44% accurate. The best overall R value for training the ANN is 0.82. The MEX process gives a 22.06% porosity error relative to the design. The two NDT-data-trained AI models, integrated into a series pipeline for selecting optimal process parameters, are proposed and verified by classical optimization and mechanical testing methods.
+
+ comment: 21 pages, 21 figures, 5 tables +
+
+
+
+
+ + ☆ TD-RD: A Top-Down Benchmark with Real-Time Framework for Road Damage + Detection + + +
+ Object detection has witnessed remarkable advancements over the past decade, largely driven by breakthroughs in deep learning and the proliferation of large-scale datasets. However, the domain of road damage detection remains relatively underexplored, despite its critical significance for applications such as infrastructure maintenance and road safety. This paper addresses this gap by introducing a novel top-down benchmark that offers a complementary perspective to existing datasets, specifically tailored for road damage detection. Our proposed Top-Down Road Damage Detection Dataset (TDRD) includes three primary categories of road damage: cracks, potholes, and patches, captured from a top-down viewpoint. The dataset consists of 7,088 high-resolution images, encompassing 12,882 annotated instances of road damage. Additionally, we present a novel real-time object detection framework, TDYOLOV10, designed to handle the unique challenges posed by the TDRD dataset. Comparative studies with state-of-the-art models demonstrate competitive baseline results. By releasing TDRD, we aim to accelerate research in this crucial area. A sample of the dataset will be made publicly available upon the paper's acceptance.
+
+
+
+
+ + ☆ Snapshot multi-spectral imaging through defocusing and a Fourier imager + network + + +
+ Multi-spectral imaging, which simultaneously captures the spatial and +spectral information of a scene, is widely used across diverse fields, +including remote sensing, biomedical imaging, and agricultural monitoring. +Here, we introduce a snapshot multi-spectral imaging approach employing a +standard monochrome image sensor with no additional spectral filters or +customized components. Our system leverages the inherent chromatic aberration +of wavelength-dependent defocusing as a natural source of physical encoding of +multi-spectral information; this encoded image information is rapidly decoded +via a deep learning-based multi-spectral Fourier Imager Network (mFIN). We +experimentally tested our method with six illumination bands and demonstrated +an overall accuracy of 92.98% for predicting the illumination channels at the +input and achieved a robust multi-spectral image reconstruction on various test +objects. This deep learning-powered framework achieves high-quality +multi-spectral image reconstruction using snapshot image acquisition with a +monochrome image sensor and could be useful for applications in biomedicine, +industrial quality control, and agriculture, among others. + +
+
+ comment: 22 Pages, 7 Figures +
+
+
+
+
+ + ☆ Deep Learning-Powered Classification of Thoracic Diseases in Chest + X-Rays + + +
+ Chest X-rays play a pivotal role in diagnosing respiratory diseases such as +pneumonia, tuberculosis, and COVID-19, which are prevalent and present unique +diagnostic challenges due to overlapping visual features and variability in +image quality. Severe class imbalance and the complexity of medical images +hinder automated analysis. This study leverages deep learning techniques, +including transfer learning on pre-trained models (AlexNet, ResNet, and +InceptionNet), to enhance disease detection and classification. By fine-tuning +these models and incorporating focal loss to address class imbalance, +significant performance improvements were achieved. Grad-CAM visualizations +further enhance model interpretability, providing insights into clinically +relevant regions influencing predictions. The InceptionV3 model, for instance, +achieved a 28% improvement in AUC and a 15% increase in F1-Score. These +findings highlight the potential of deep learning to improve diagnostic +workflows and support clinical decision-making. + +
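Focal loss, used here to counter class imbalance, down-weights well-classified examples so rare findings dominate the gradient. A standard multi-class formulation in PyTorch follows; the hyperparameters are the usual defaults and the class count is assumed, not necessarily those used in the study.

```python
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, gamma=2.0, alpha=0.25):
    """Multi-class focal loss: scales the per-sample cross-entropy by (1 - p_t)^gamma."""
    ce = F.cross_entropy(logits, targets, reduction="none")   # -log p_t per sample
    p_t = torch.exp(-ce)
    return (alpha * (1.0 - p_t) ** gamma * ce).mean()

logits = torch.randn(16, 14, requires_grad=True)   # e.g. 14 thoracic findings (assumed)
labels = torch.randint(0, 14, (16,))
focal_loss(logits, labels).backward()
```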
+
+
+
+
+ + ☆ Dense-SfM: Structure from Motion with Dense Consistent Matching + + +
+ We present Dense-SfM, a novel Structure from Motion (SfM) framework designed +for dense and accurate 3D reconstruction from multi-view images. Sparse +keypoint matching, which traditional SfM methods often rely on, limits both +accuracy and point density, especially in texture-less areas. Dense-SfM +addresses this limitation by integrating dense matching with a Gaussian +Splatting (GS) based track extension which gives more consistent, longer +feature tracks. To further improve reconstruction accuracy, Dense-SfM is +equipped with a multi-view kernelized matching module leveraging transformer +and Gaussian Process architectures, for robust track refinement across +multi-views. Evaluations on the ETH3D and Texture-Poor SfM datasets show that +Dense-SfM offers significant improvements in accuracy and density over +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Global Semantic-Guided Sub-image Feature Weight Allocation in + High-Resolution Large Vision-Language Models + + +
+ As the demand for high-resolution image processing in Large Vision-Language +Models (LVLMs) grows, sub-image partitioning has become a popular approach for +mitigating visual information loss associated with fixed-resolution processing. +However, existing partitioning methods uniformly process sub-images, resulting +in suboptimal image understanding. In this work, we reveal that the sub-images +with higher semantic relevance to the entire image encapsulate richer visual +information for preserving the model's visual understanding ability. Therefore, +we propose the Global Semantic-guided Weight Allocator (GSWA) module, which +dynamically allocates weights to sub-images based on their relative information +density, emulating human visual attention mechanisms. This approach enables the +model to focus on more informative regions, overcoming the limitations of +uniform treatment. We integrate GSWA into the InternVL2-2B framework to create +SleighVL, a lightweight yet high-performing model. Extensive experiments +demonstrate that SleighVL outperforms models with comparable parameters and +remains competitive with larger models. Our work provides a promising direction +for more efficient and contextually aware high-resolution image processing in +LVLMs, advancing multimodal system development. + +
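A simplified stand-in for the weight allocator: score each sub-image feature by cosine similarity to the global image feature and turn the scores into softmax weights. The feature size, grid, and temperature are assumptions, not the GSWA module itself.

```python
import torch
import torch.nn.functional as F

def semantic_weights(global_feat, subimage_feats, temperature=0.1):
    """Softmax weights for sub-images based on similarity to the global image feature."""
    g = F.normalize(global_feat, dim=-1)           # (d,)
    s = F.normalize(subimage_feats, dim=-1)        # (n, d)
    return F.softmax((s @ g) / temperature, dim=0) # (n,) weights summing to 1

global_feat = torch.randn(256)
sub_feats = torch.randn(9, 256)                    # e.g. a 3x3 sub-image grid
w = semantic_weights(global_feat, sub_feats)
pooled = (w.unsqueeze(-1) * sub_feats).sum(dim=0)  # information-density weighted pooling
```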
+
+ comment: 10 pages, 10 figures and tables +
+
+
+
+
+ + ☆ Bayesian Neural Networks for One-to-Many Mapping in Image Enhancement + + +
+ In image enhancement tasks, such as low-light and underwater image +enhancement, a degraded image can correspond to multiple plausible target +images due to dynamic photography conditions, such as variations in +illumination. This naturally results in a one-to-many mapping challenge. To +address this, we propose a Bayesian Enhancement Model (BEM) that incorporates +Bayesian Neural Networks (BNNs) to capture data uncertainty and produce diverse +outputs. To achieve real-time inference, we introduce a two-stage approach: +Stage I employs a BNN to model the one-to-many mappings in the low-dimensional +space, while Stage II refines fine-grained image details using a Deterministic +Neural Network (DNN). To accelerate BNN training and convergence, we introduce +a dynamic \emph{Momentum Prior}. Extensive experiments on multiple low-light +and underwater image enhancement benchmarks demonstrate the superiority of our +method over deterministic models. + +
+
+
+
+
+ + ☆ CDI: Blind Image Restoration Fidelity Evaluation based on Consistency + with Degraded Image + + +
+ Recent advancements in Blind Image Restoration (BIR) methods, based on Generative Adversarial Networks and Diffusion Models, have significantly improved visual quality. However, they present significant challenges for Image Quality Assessment (IQA), as existing Full-Reference IQA methods often rate images with high perceptual quality poorly. In this paper, we reassess the Solution Non-Uniqueness and Degradation Indeterminacy issues of BIR, and propose constructing a specific BIR IQA system. Instead of directly comparing a restored image with a reference image, the BIR IQA evaluates fidelity by calculating the Consistency with Degraded Image (CDI). Specifically, we propose a wavelet-domain Reference Guided CDI algorithm, which can acquire the consistency with a degraded image for various degradation types without requiring knowledge of the degradation parameters. The supported degradation types include downsampling, blur, noise, JPEG compression, and complex combined degradations. In addition, we propose a Reference Agnostic CDI, enabling BIR fidelity evaluation without reference images. Finally, in order to validate the rationality of CDI, we create a new Degraded Images Switch Display Comparison Dataset (DISDCD) for subjective evaluation of BIR fidelity. Experiments conducted on DISDCD verify that CDI is markedly superior to common Full-Reference IQA methods for BIR fidelity evaluation. The source code and the DISDCD dataset will be publicly available shortly.
+
+
+
+
+ + ☆ Point-LN: A Lightweight Framework for Efficient Point Cloud + Classification Using Non-Parametric Positional Encoding + + +
+ We introduce Point-LN, a novel lightweight framework engineered for efficient +3D point cloud classification. Point-LN integrates essential non-parametric +components-such as Farthest Point Sampling (FPS), k-Nearest Neighbors (k-NN), +and non-learnable positional encoding-with a streamlined learnable classifier +that significantly enhances classification accuracy while maintaining a minimal +parameter footprint. This hybrid architecture ensures low computational costs +and rapid inference speeds, making Point-LN ideal for real-time and +resource-constrained applications. Comprehensive evaluations on benchmark +datasets, including ModelNet40 and ScanObjectNN, demonstrate that Point-LN +achieves competitive performance compared to state-of-the-art methods, all +while offering exceptional efficiency. These results establish Point-LN as a +robust and scalable solution for diverse point cloud classification tasks, +highlighting its potential for widespread adoption in various computer vision +applications. + +
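The non-parametric components named above are standard building blocks; for reference, a brute-force NumPy sketch of farthest point sampling and k-nearest neighbours is given below (sizes are arbitrary, and this is not the Point-LN implementation).

```python
import numpy as np

def farthest_point_sampling(points, k):
    """Greedy FPS: repeatedly pick the point farthest from the already selected set."""
    n = points.shape[0]
    selected = [0]
    dists = np.full(n, np.inf)
    for _ in range(k - 1):
        dists = np.minimum(dists, np.linalg.norm(points - points[selected[-1]], axis=1))
        selected.append(int(np.argmax(dists)))
    return np.array(selected)

def knn_indices(points, queries, k):
    """Brute-force k-nearest-neighbour indices for each query point."""
    d = np.linalg.norm(queries[:, None, :] - points[None, :, :], axis=-1)
    return np.argsort(d, axis=1)[:, :k]

cloud = np.random.rand(1024, 3)
centers = cloud[farthest_point_sampling(cloud, 64)]
neighbors = knn_indices(cloud, centers, k=16)        # (64, 16) neighbourhood indices
```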
+
+ comment: This paper has been accepted for presentation at the 29th + International Computer Conference, Computer Society of Iran (CSICC) 2025 +
+
+
+
+
+ + ☆ Micro-macro Wavelet-based Gaussian Splatting for 3D Reconstruction from + Unconstrained Images + + +
+ 3D reconstruction from unconstrained image collections presents substantial +challenges due to varying appearances and transient occlusions. In this paper, +we introduce Micro-macro Wavelet-based Gaussian Splatting (MW-GS), a novel +approach designed to enhance 3D reconstruction by disentangling scene +representations into global, refined, and intrinsic components. The proposed +method features two key innovations: Micro-macro Projection, which allows +Gaussian points to capture details from feature maps across multiple scales +with enhanced diversity; and Wavelet-based Sampling, which leverages frequency +domain information to refine feature representations and significantly improve +the modeling of scene appearances. Additionally, we incorporate a Hierarchical +Residual Fusion Network to seamlessly integrate these features. Extensive +experiments demonstrate that MW-GS delivers state-of-the-art rendering +performance, surpassing existing methods. + +
+
comment: 11 pages, 6 figures, accepted by AAAI 2025
+
+
+
+
+ + ☆ GreedyPixel: Fine-Grained Black-Box Adversarial Attack Via Greedy + Algorithm + + +
+ A critical requirement for deep learning models is ensuring their robustness +against adversarial attacks. These attacks commonly introduce noticeable +perturbations, compromising the visual fidelity of adversarial examples. +Another key challenge is that while white-box algorithms can generate effective +adversarial perturbations, they require access to the model gradients, limiting +their practicality in many real-world scenarios. Existing attack mechanisms +struggle to achieve similar efficacy without access to these gradients. In this +paper, we introduce GreedyPixel, a novel pixel-wise greedy algorithm designed +to generate high-quality adversarial examples using only query-based feedback +from the target model. GreedyPixel improves computational efficiency in what is +typically a brute-force process by perturbing individual pixels in sequence, +guided by a pixel-wise priority map. This priority map is constructed by +ranking gradients obtained from a surrogate model, providing a structured path +for perturbation. Our results demonstrate that GreedyPixel achieves attack +success rates comparable to white-box methods without the need for gradient +information, and surpasses existing algorithms in black-box settings, offering +higher success rates, reduced computational time, and imperceptible +perturbations. These findings underscore the advantages of GreedyPixel in terms +of attack efficacy, time efficiency, and visual quality. + +
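The greedy loop can be pictured as follows: visit pixels in priority order (the priority map would come from surrogate-model gradients) and keep a +/-eps change only if the query-only loss increases. This is an illustrative simplification with a placeholder loss and priority map, not the authors' algorithm verbatim.

```python
import numpy as np

def greedy_pixel_attack(query_loss, image, priority, eps=8 / 255, budget=200):
    """Greedily perturb the highest-priority pixels, keeping changes that raise the loss."""
    adv, best = image.copy(), query_loss(image)
    for flat_idx in np.argsort(priority.ravel())[::-1][:budget]:
        idx = np.unravel_index(flat_idx, image.shape)
        for step in (eps, -eps):
            trial = adv.copy()
            trial[idx] = np.clip(image[idx] + step, 0.0, 1.0)  # stay within the eps budget
            loss = query_loss(trial)
            if loss > best:
                adv, best = trial, loss
                break
    return adv

img = np.random.rand(3, 32, 32)
prio = np.abs(np.random.randn(*img.shape))            # stand-in for surrogate-gradient magnitudes
dummy_loss = lambda x: float(np.abs(x - img).sum())   # placeholder query-only objective
adv = greedy_pixel_attack(dummy_loss, img, prio)
```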
+
+
+
+
+ + ☆ Detection and Classification of Acute Lymphoblastic Leukemia Utilizing + Deep Transfer Learning + + +
+ A mutation in the DNA of a single cell that compromises its function initiates leukemia, leading to the overproduction of immature white blood cells that encroach upon the space required for the generation of healthy blood cells. Leukemia is treatable if identified in its initial stages. However, its diagnosis is both arduous and time-consuming. This study proposes a novel approach for diagnosing leukemia across four stages (Benign, Early, Pre, and Pro) using deep learning techniques. We employed two Convolutional Neural Network (CNN) models: MobileNetV2 with an altered head, and a custom model. The custom model consists of multiple convolutional layers, each paired with a corresponding max pooling layer. We utilized MobileNetV2 with ImageNet weights, adjusting the head to integrate the final results. The dataset used is the publicly available "Acute Lymphoblastic Leukemia (ALL) Image Dataset", and we applied the Synthetic Minority Oversampling Technique (SMOTE) to augment and balance the training dataset. The custom model achieved an accuracy of 98.6%, while MobileNetV2 attained a superior accuracy of 99.69%. The pretrained model showed promising results, indicating an increased likelihood of real-world application.
+
+ comment: 4 pages, 4 figures, Submitted to UCICS +
+
+
+
+
+ + ☆ PuzzleGPT: Emulating Human Puzzle-Solving Ability for Time and Location + Prediction + + +
+ The task of predicting time and location from images is challenging and +requires complex human-like puzzle-solving ability over different clues. In +this work, we formalize this ability into core skills and implement them using +different modules in an expert pipeline called PuzzleGPT. PuzzleGPT consists of +a perceiver to identify visual clues, a reasoner to deduce prediction +candidates, a combiner to combinatorially combine information from different +clues, a web retriever to get external knowledge if the task can't be solved +locally, and a noise filter for robustness. This results in a zero-shot, +interpretable, and robust approach that records state-of-the-art performance on +two datasets -- TARA and WikiTilo. PuzzleGPT outperforms large VLMs such as +BLIP-2, InstructBLIP, LLaVA, and even GPT-4V, as well as automatically +generated reasoning pipelines like VisProg, by at least 32% and 38%, +respectively. It even rivals or surpasses finetuned models. + +
+
+ comment: NAACL 2025 Findings +
+
+
+
+
+ + ☆ You Only Teach Once: Learn One-Shot Bimanual Robotic Manipulation from + Video Demonstrations + + +
+ Bimanual robotic manipulation is a long-standing challenge of embodied +intelligence due to its characteristics of dual-arm spatial-temporal +coordination and high-dimensional action spaces. Previous studies rely on +pre-defined action taxonomies or direct teleoperation to alleviate or +circumvent these issues, often making them lack simplicity, versatility and +scalability. Differently, we believe that the most effective and efficient way +for teaching bimanual manipulation is learning from human demonstrated videos, +where rich features such as spatial-temporal positions, dynamic postures, +interaction states and dexterous transitions are available almost for free. In +this work, we propose the YOTO (You Only Teach Once), which can extract and +then inject patterns of bimanual actions from as few as a single binocular +observation of hand movements, and teach dual robot arms various complex tasks. +Furthermore, based on keyframes-based motion trajectories, we devise a subtle +solution for rapidly generating training demonstrations with diverse variations +of manipulated objects and their locations. These data can then be used to +learn a customized bimanual diffusion policy (BiDP) across diverse scenes. In +experiments, YOTO achieves impressive performance in mimicking 5 intricate +long-horizon bimanual tasks, possesses strong generalization under different +visual and spatial conditions, and outperforms existing visuomotor imitation +learning methods in accuracy and efficiency. Our project link is +https://hnuzhy.github.io/projects/YOTO. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Dynamic Token Reduction during Generation for Vision Language Models + + +
+ Vision-Language Models (VLMs) have achieved notable success in multimodal tasks but face practical limitations due to the quadratic complexity of decoder attention mechanisms and autoregressive generation. Existing methods like FASTV and VTW have achieved notable results in reducing redundant visual tokens, but these approaches focus on pruning tokens in a single forward pass without systematically analyzing the redundancy of visual tokens throughout the entire generation process. In this paper, we introduce a dynamic pruning strategy tailored for VLMs, named Dynamic Rate (DyRate), which progressively adjusts the compression rate during generation. Our analysis of the distribution of attention reveals that the importance of visual tokens decreases throughout the generation process, inspiring us to adopt progressively more aggressive compression rates. By integrating a lightweight predictor based on the attention distribution, our approach enables flexible adjustment of the pruning rate during generation. Our experimental results demonstrate that our method not only reduces computational demands but also maintains the quality of responses.
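A toy version of the idea: as generation proceeds, keep a shrinking fraction of visual tokens, ranked by the attention they receive. The linear schedule here is purely for illustration; the paper instead learns the rate with a lightweight predictor.

```python
import torch

def prune_visual_tokens(tokens, attn_mass, step, max_steps, min_keep=0.3):
    """Keep a step-dependent fraction of visual tokens, ranked by received attention."""
    keep_ratio = 1.0 - (1.0 - min_keep) * (step / max_steps)   # linear schedule (assumed)
    k = max(1, int(keep_ratio * tokens.shape[0]))
    keep = torch.topk(attn_mass, k).indices.sort().values
    return tokens[keep], keep

tokens = torch.randn(576, 1024)        # e.g. 24x24 visual tokens (sizes assumed)
attn = torch.rand(576)                 # attention mass each visual token receives
pruned, kept_idx = prune_visual_tokens(tokens, attn, step=20, max_steps=100)
```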
+
+
+
+
+ + ☆ Sparse Mixture-of-Experts for Non-Uniform Noise Reduction in MRI Images + + +
+ Magnetic Resonance Imaging (MRI) is an essential diagnostic tool in clinical settings, but its utility is often hindered by noise artifacts introduced during the imaging process. Effective denoising is critical for enhancing image quality while preserving anatomical structures. However, traditional denoising methods, which often assume uniform noise distributions, struggle to handle the non-uniform noise commonly present in MRI images. In this paper, we introduce a novel approach leveraging a sparse mixture-of-experts framework for MRI image denoising. Each expert is a specialized denoising convolutional neural network fine-tuned to target specific noise characteristics associated with different image regions. Our method demonstrates superior performance over state-of-the-art denoising techniques on both synthetic and real-world brain MRI datasets. Furthermore, we show that it generalizes effectively to unseen datasets, highlighting its robustness and adaptability.
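A toy sparse mixture-of-experts for denoising, where a gate routes each patch to a single small CNN expert; at training time the hard argmax would normally be replaced by a differentiable (noisy top-k) gate. Expert count and layer sizes are placeholders, not the paper's architecture.

```python
import torch
import torch.nn as nn

class SparseDenoisingMoE(nn.Module):
    """Route each noisy patch to its top-1 denoising expert (illustrative only)."""
    def __init__(self, num_experts=4, ch=1):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Conv2d(ch, 16, 3, padding=1), nn.ReLU(),
                          nn.Conv2d(16, ch, 3, padding=1))
            for _ in range(num_experts))
        self.gate = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(),
                                  nn.Linear(ch, num_experts))

    def forward(self, patches):                       # (batch, ch, H, W) noisy patches
        choice = self.gate(patches).argmax(dim=1)     # top-1 expert per patch
        out = torch.zeros_like(patches)
        for e, expert in enumerate(self.experts):
            mask = choice == e
            if mask.any():
                out[mask] = expert(patches[mask])
        return out

denoised = SparseDenoisingMoE()(torch.randn(8, 1, 32, 32))
```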
+
+ comment: Accepted to the WACV Workshop on Image Quality +
+
+
+
+
+ + ☆ VideoShield: Regulating Diffusion-based Video Generation Models via + Watermarking ICLR + + +
+ Artificial Intelligence Generated Content (AIGC) has advanced significantly, particularly with the development of video generation models such as text-to-video (T2V) models and image-to-video (I2V) models. However, like other AIGC types, video generation requires robust content control. A common approach is to embed watermarks, but most research has focused on images, with limited attention given to videos. Traditional methods, which embed watermarks frame-by-frame in a post-processing manner, often degrade video quality. In this paper, we propose VideoShield, a novel watermarking framework specifically designed for popular diffusion-based video generation models. Unlike post-processing methods, VideoShield embeds watermarks directly during video generation, eliminating the need for additional training. To ensure video integrity, we introduce a tamper localization feature that can detect changes both temporally (across frames) and spatially (within individual frames). Our method maps watermark bits to template bits, which are then used to generate watermarked noise during the denoising process. Using DDIM Inversion, we can reverse the video to its original watermarked noise, enabling straightforward watermark extraction. Additionally, the template bits allow precise localization of potential temporal and spatial modifications. Extensive experiments across various video models (both T2V and I2V) demonstrate that our method effectively extracts watermarks and detects tampering without compromising video quality. Furthermore, we show that this approach is applicable to image generation models, enabling tamper detection in generated images as well. Codes and models are available at \href{https://github.com/hurunyi/VideoShield}{https://github.com/hurunyi/VideoShield}.
+
+ comment: International Conference on Learning Representations (ICLR) 2025 +
+
+
+
+
+ + ☆ ENTER: Event Based Interpretable Reasoning for VideoQA + + +
+ In this paper, we present ENTER, an interpretable Video Question Answering +(VideoQA) system based on event graphs. Event graphs convert videos into +graphical representations, where video events form the nodes and event-event +relationships (temporal/causal/hierarchical) form the edges. This structured +representation offers many benefits: 1) Interpretable VideoQA via generated +code that parses event-graph; 2) Incorporation of contextual visual information +in the reasoning process (code generation) via event graphs; 3) Robust VideoQA +via Hierarchical Iterative Update of the event graphs. Existing interpretable +VideoQA systems are often top-down, disregarding low-level visual information +in the reasoning plan generation, and are brittle. While bottom-up approaches +produce responses from visual data, they lack interpretability. Experimental +results on NExT-QA, IntentQA, and EgoSchema demonstrate that not only does our +method outperform existing top-down approaches while obtaining competitive +performance against bottom-up approaches, but more importantly, offers superior +interpretability and explainability in the reasoning process. + +
+
+
+
+
+ + ☆ High-Precision Fabric Defect Detection via Adaptive Shape Convolutions + and Large Kernel Spatial Modeling + + +
+ Detecting fabric defects in the textile industry remains a challenging task +due to the diverse and complex nature of defect patterns. Traditional methods +often suffer from slow inference speeds, limited accuracy, and inadequate +recognition rates, particularly in scenarios involving intricate or subtle +defects. To overcome these limitations, we introduce Fab-ASLKS, an advanced +fabric defect detection framework built upon the YOLOv8s architecture. +Fab-ASLKS incorporates two key modules: (1) the Adaptive Shape Convolution +Module (ASCM), which leverages adaptive shape convolution within the Neck to +enhance feature fusion and improve efficiency by extending the capabilities of +the standard C2f structure, and (2) the Large Kernel Shift Convolution Module +(LKSCM), designed to emulate large kernel effects within the Backbone, enabling +superior spatial information extraction. These modules collaboratively optimize +feature extraction and information integration across the network. Extensive +experiments conducted on the Tianchi fabric defect detection dataset +demonstrate that Fab-ASLKS achieves a 5% improvement in mAP@50 over the +baseline, showcasing its capability to deliver high precision and efficiency. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Post-hoc Spurious Correlation Neutralization with Single-Weight + Fictitious Class Unlearning + + +
+ Neural network training tends to exploit the simplest features as shortcuts to greedily minimize training loss. However, some of these features might be spuriously correlated with the target labels, leading to incorrect predictions by the model. Several methods have been proposed to address this issue. Because they focus on suppressing the spurious correlations during model training, they not only incur additional training cost, but also have limited practical utility, as the model misbehavior due to spurious relations is usually discovered only after deployment. It is also often overlooked that spuriousness is a subjective notion. Hence, the precise questions that must be investigated are: to what degree is a feature spurious, and how can we proportionally divert the model's attention from it for reliable prediction. To this end, we propose a method that enables post-hoc neutralization of spurious feature impact, controllable to an arbitrary degree. We conceptualize spurious features as fictitious sub-classes within the original classes, which can be eliminated by a class removal scheme. We then propose a unique precise class removal technique that employs a single-weight modification, which entails negligible performance compromise for the remaining classes. We perform extensive experiments, demonstrating that by editing just a single weight in a post-hoc manner, our method achieves highly competitive, or better, performance against the state-of-the-art methods.
+
+
+
+
+ + ☆ Dreamweaver: Learning Compositional World Representations from Pixels + + +
+ Humans have an innate ability to decompose their perceptions of the world +into objects and their attributes, such as colors, shapes, and movement +patterns. This cognitive process enables us to imagine novel futures by +recombining familiar concepts. However, replicating this ability in artificial +intelligence systems has proven challenging, particularly when it comes to +modeling videos into compositional concepts and generating unseen, recomposed +futures without relying on auxiliary data, such as text, masks, or bounding +boxes. In this paper, we propose Dreamweaver, a neural architecture designed to +discover hierarchical and compositional representations from raw videos and +generate compositional future simulations. Our approach leverages a novel +Recurrent Block-Slot Unit (RBSU) to decompose videos into their constituent +objects and attributes. In addition, Dreamweaver uses a multi-future-frame +prediction objective to capture disentangled representations for dynamic +concepts more effectively as well as static concepts. In experiments, we +demonstrate our model outperforms current state-of-the-art baselines for world +modeling when evaluated under the DCI framework across multiple datasets. +Furthermore, we show how the modularized concept representations of our model +enable compositional imagination, allowing the generation of novel videos by +recombining attributes from different objects. + +
+
+
+
+
+ + ☆ UltraLightSqueezeNet: A Deep Learning Architecture for Malaria + Classification with up to 54x fewer trainable parameters for resource + constrained devices + + +
+ Lightweight deep learning approaches for malaria detection have gained +attention for their potential to enhance diagnostics in resource-constrained +environments. For our study, we selected SqueezeNet1.1 as it is one of the most +popular lightweight architectures. SqueezeNet1.1 is a later version of +SqueezeNet1.0 and is 2.4 times more computationally efficient than the original +model. We propose and implement three ultra-lightweight architecture +variants of the SqueezeNet1.1 architecture, namely Variant 1 (one fire module), +Variant 2 (two fire modules), and Variant 3 (four fire modules), which are even +more compact than SqueezeNet1.1 (eight fire modules). These models were +implemented to identify the best-performing variant that achieves superior +computational efficiency without sacrificing accuracy in malaria blood cell +classification. The models were trained and evaluated using the NIH Malaria +dataset. We assessed each model's performance based on metrics including +accuracy, recall, precision, F1-score, and Area Under the Curve (AUC). The +results show that the SqueezeNet1.1 model achieves the highest performance +across all metrics, with a classification accuracy of 97.12%. Variant 3 (four +fire modules) offers a competitive alternative, delivering almost identical +results (accuracy 96.55%) with a 6x reduction in computational overhead +compared to SqueezeNet1.1. Variant 2 and Variant 1 perform slightly worse than +Variant 3, with Variant 2 (two fire modules) reducing computational overhead by +28x, and Variant 1 (one fire module) achieving a 54x reduction in trainable +parameters compared to SqueezeNet1.1. These findings demonstrate that our +SqueezeNet1.1 architecture variants provide a flexible approach to malaria +detection, enabling the selection of a variant that balances resource +constraints and performance. + 
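For readers unfamiliar with SqueezeNet's building block, the parameter savings come from stacking fewer "fire" modules (a 1x1 squeeze convolution followed by parallel 1x1 and 3x3 expand convolutions). The PyTorch sketch below shows a generic fire module and a single-fire-module network in the spirit of Variant 1; the channel sizes and layout are illustrative assumptions, not the paper's exact configuration.

```python
import torch
import torch.nn as nn

class Fire(nn.Module):
    """Squeeze with 1x1 convs, then expand with parallel 1x1 and 3x3 convs."""
    def __init__(self, in_ch, squeeze_ch, expand_ch):
        super().__init__()
        self.squeeze = nn.Sequential(nn.Conv2d(in_ch, squeeze_ch, 1), nn.ReLU(inplace=True))
        self.expand1 = nn.Sequential(nn.Conv2d(squeeze_ch, expand_ch, 1), nn.ReLU(inplace=True))
        self.expand3 = nn.Sequential(nn.Conv2d(squeeze_ch, expand_ch, 3, padding=1), nn.ReLU(inplace=True))

    def forward(self, x):
        s = self.squeeze(x)
        return torch.cat([self.expand1(s), self.expand3(s)], dim=1)

# Illustrative one-fire-module network (hypothetical channel sizes).
tiny = nn.Sequential(
    nn.Conv2d(3, 64, 3, stride=2, padding=1), nn.ReLU(inplace=True),
    nn.MaxPool2d(3, stride=2),
    Fire(64, 16, 64),                       # one fire module instead of eight
    nn.AdaptiveAvgPool2d(1), nn.Flatten(),
    nn.Linear(128, 2),                      # parasitized vs. uninfected cells
)
print(tiny(torch.randn(1, 3, 64, 64)).shape)                 # torch.Size([1, 2])
print(sum(p.numel() for p in tiny.parameters()), "trainable parameters")
```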
+
+
+
+
+ + ☆ Fully Guided Neural Schrödinger bridge for Brain MR image synthesis + + +
+ Multi-modal brain MRI provides essential complementary information for +clinical diagnosis. However, acquiring all modalities is often challenging due +to time and cost constraints. To address this, various methods have been +proposed to generate missing modalities from available ones. Traditional +approaches can be broadly categorized into two main types: paired and unpaired +methods. While paired methods offer superior performance, obtaining large-scale +paired datasets is challenging in real-world scenarios. Conversely, unpaired +methods facilitate large-scale data collection but struggle to preserve +critical image features, such as tumors. In this paper, we propose Fully Guided +Schr\"odinger Bridges (FGSB), a novel framework based on Neural Schr\"odinger +Bridges, to overcome these limitations. FGSB achieves stable, high-quality +generation of missing modalities using minimal paired data. Furthermore, when +provided with ground truth or a segmentation network for specific regions, FGSB +can generate missing modalities while preserving these critical areas with +reduced data requirements. Our proposed model consists of two consecutive +phases. 1) Generation Phase: fuses a generated image, a paired reference image, +and Gaussian noise, employing iterative refinement to mitigate issues such as +mode collapse and improve generation quality. 2) Training Phase: learns the +mapping from the generated image to the target modality. Experiments +demonstrate that FGSB achieves comparable generation performance to methods +trained on large datasets, while using data from only two subjects. Moreover, +the utilization of lesion information with FGSB significantly enhances its +ability to preserve crucial lesion features. + 
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Enhancing Multimodal Entity Linking with Jaccard Distance-based + Conditional Contrastive Learning and Contextual Visual Augmentation + + +
+ Previous research on multimodal entity linking (MEL) has primarily employed +contrastive learning as the training objective. However, by using the rest of the +batch as negative samples without careful consideration, these studies risk +leveraging easy features and potentially overlooking essential details that make +entities unique. In this work, we propose JD-CCL (Jaccard Distance-based +Conditional Contrastive Learning), a novel approach designed to enhance the +matching ability of multimodal entity linking models. JD-CCL leverages +meta-information to select negative samples with similar attributes, making the +linking task more challenging and robust. Additionally, to address the +limitations caused by the variations within the visual modality among mentions +and entities, we introduce a novel method, CVaCPT (Contextual Visual-aid +Controllable Patch Transform). It enhances visual representations by +incorporating multi-view synthetic images and contextual textual +representations to scale and shift patch representations. Experimental results +on benchmark MEL datasets demonstrate the strong effectiveness of our approach. + 
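The Jaccard-distance-based selection can be pictured as follows: given meta-attribute sets for each candidate entity, the candidates whose attributes overlap most with the anchor (lowest Jaccard distance) are preferred as hard negatives. The snippet below is a schematic sketch under assumed attribute sets, not the authors' implementation.

```python
def jaccard_distance(a: set, b: set) -> float:
    """1 - |A ∩ B| / |A ∪ B|; 0 means identical attribute sets."""
    union = a | b
    return 1.0 - (len(a & b) / len(union) if union else 1.0)

def hard_negatives(anchor_attrs, candidates, k=2):
    """Pick the k candidates most similar to the anchor as challenging negatives."""
    ranked = sorted(candidates.items(), key=lambda kv: jaccard_distance(anchor_attrs, kv[1]))
    return [name for name, _ in ranked[:k]]

# Hypothetical meta-information (entity type, country, topic, ...).
anchor = {"person", "usa", "basketball"}
candidates = {
    "entity_a": {"person", "usa", "baseball"},
    "entity_b": {"building", "france"},
    "entity_c": {"person", "usa", "basketball", "coach"},
}
print(hard_negatives(anchor, candidates))  # attribute-similar entities come first
```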
+
+
+
+
+ + ☆ Advancing MRI Reconstruction: A Systematic Review of Deep Learning and + Compressed Sensing Integration + + +
+ Magnetic resonance imaging (MRI) is a non-invasive imaging modality and +provides comprehensive anatomical and functional insights into the human body. +However, its long acquisition times can lead to patient discomfort and motion +artifacts, and limit real-time applications. To address these challenges, +strategies such as parallel imaging have been applied, which utilize multiple +receiver coils to speed up the data acquisition process. Additionally, +compressed sensing (CS) is a method that facilitates image reconstruction from +sparse data, significantly reducing image acquisition time by minimizing the +amount of data collection needed. Recently, deep learning (DL) has emerged as a +powerful tool for improving MRI reconstruction. It has been integrated with +parallel imaging and CS principles to achieve faster and more accurate MRI +reconstructions. This review comprehensively examines DL-based techniques for +MRI reconstruction. We categorize and discuss various DL-based methods, +including end-to-end approaches, unrolled optimization, and federated learning, +highlighting their potential benefits. Our systematic review highlights +significant contributions and underscores the potential of DL in MRI +reconstruction. Additionally, we summarize key results and trends in DL-based +MRI reconstruction, including quantitative metrics, datasets, acceleration +factors, and the progress of and research interest in DL techniques over time. +Finally, we discuss potential future directions and the importance of DL-based +MRI reconstruction in advancing medical imaging. To facilitate further research +in this area, we provide a GitHub repository that includes up-to-date DL-based +MRI reconstruction publications and public +datasets: https://github.com/mosaf/Awesome-DL-based-CS-MRI. + 
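Among the reviewed families, unrolled optimization is the easiest to make concrete: each network "iteration" alternates a data-consistency step in k-space with a learned denoising/regularization step. The sketch below shows one such iteration with a single-coil Fourier model and a placeholder CNN denoiser; it is a conceptual illustration, not a specific method from the review, and the step size, mask density, and network sizes are assumptions.

```python
import torch
import torch.nn as nn

class UnrolledIter(nn.Module):
    """One unrolled step: gradient step on ||M F x - y||^2, then a learned denoiser."""
    def __init__(self, channels=2, step=0.5):
        super().__init__()
        self.step = nn.Parameter(torch.tensor(step))
        self.denoiser = nn.Sequential(           # tiny placeholder regularizer
            nn.Conv2d(channels, 32, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, channels, 3, padding=1),
        )

    def forward(self, x, y, mask):
        # x: complex image estimate, y: undersampled k-space, mask: sampling pattern
        k = torch.fft.fft2(x)
        grad = torch.fft.ifft2(mask * (mask * k - y))          # data-consistency gradient
        x = x - self.step * grad
        xr = torch.stack([x.real, x.imag], dim=1)              # complex -> 2 channels
        xr = xr + self.denoiser(xr)                            # residual denoising
        return torch.complex(xr[:, 0], xr[:, 1])

x = torch.zeros(1, 64, 64, dtype=torch.complex64)
mask = (torch.rand(1, 64, 64) > 0.7).float()                   # ~30% of k-space sampled
y = mask * torch.fft.fft2(torch.randn(1, 64, 64, dtype=torch.complex64))
print(UnrolledIter()(x, y, mask).shape)                        # torch.Size([1, 64, 64])
```

Stacking several such blocks and training them end to end gives the unrolled architectures discussed in the review.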
+
+
+
+
+ + ☆ Effective Defect Detection Using Instance Segmentation for NDI + + +
+ Ultrasonic testing is a common Non-Destructive Inspection (NDI) method used +in aerospace manufacturing. However, the complexity and size of the ultrasonic +scans make it challenging to identify defects through visual inspection or +machine learning models. Using computer vision techniques to identify defects +from ultrasonic scans is an evolving research area. In this study, we used +instance segmentation to identify the presence of defects in the ultrasonic +scan images of composite panels that are representative of real components +manufactured in aerospace. We used two models based on Mask-RCNN (Detectron 2) +and YOLO 11 respectively. Additionally, we implemented a simple statistical +pre-processing technique that reduces the burden of requiring custom-tailored +pre-processing techniques. Our study demonstrates the feasibility and +effectiveness of using instance segmentation in the NDI pipeline by +significantly reducing data pre-processing time, inspection time, and overall +costs. + +
+
+ comment: 6 pages, 2 figures, 2 tables. Published at AI2ASE 2025 workshop at + AAAI2025. Accepted publication is available at https://ai-2-ase.github.io/ +
+
+
+
+
+ + ☆ SelfPrompt: Confidence-Aware Semi-Supervised Tuning for Robust + Vision-Language Model Adaptation + + +
+ We present SelfPrompt, a novel prompt-tuning approach for vision-language +models (VLMs) in a semi-supervised learning setup. Existing methods for tuning +VLMs in semi-supervised setups struggle with the negative impact of the +miscalibrated VLMs on pseudo-labelling, and the accumulation of noisy +pseudo-labels. SelfPrompt addresses these challenges by introducing a +cluster-guided pseudo-labelling method that improves pseudo-label accuracy, and +a confidence-aware semi-supervised learning module that maximizes the +utilization of unlabelled data by combining supervised learning and +weakly-supervised learning. Additionally, we investigate our method in an +active semi-supervised learning setup, where the labelled set is strategically +selected to ensure the best utilization of a limited labelling budget. To this +end, we propose a weakly-supervised sampling technique that selects a diverse +and representative labelled set, which can be seamlessly integrated into +existing methods to enhance their performance. We conduct extensive evaluations +across 13 datasets, significantly surpassing state-of-the-art performances with +average improvements of 6.23% in standard semi-supervised learning, 6.25% in +active semi-supervised learning, and 4.9% in base-to-novel generalization, +using a 2-shot setup. Furthermore, SelfPrompt shows excellent generalization in +single-shot settings, achieving an average improvement of 11.78%. + +
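The cluster-guided pseudo-labelling idea can be sketched as: embed the unlabelled samples, cluster them, assign each cluster the label its members agree on most, and keep only sufficiently confident samples. The snippet below uses scikit-learn's KMeans on dummy features as a schematic; the cluster count and confidence threshold are hypothetical, and this is not the authors' exact module.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
feats = rng.normal(size=(200, 32))                 # dummy VLM image embeddings
probs = rng.dirichlet(np.ones(10), size=200)       # dummy zero-shot class probabilities

clusters = KMeans(n_clusters=10, n_init=10, random_state=0).fit_predict(feats)

pseudo_labels = np.full(200, -1)
for c in np.unique(clusters):
    idx = np.where(clusters == c)[0]
    cluster_label = probs[idx].mean(axis=0).argmax()   # cluster-level vote over classes
    conf = probs[idx, cluster_label]                   # per-sample confidence for that label
    keep = idx[conf > 0.15]                            # hypothetical confidence threshold
    pseudo_labels[keep] = cluster_label

print("kept", (pseudo_labels >= 0).sum(), "of 200 unlabelled samples")
```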
+
+
+
+
+ + ♻ ☆ Hierarchical Vector Quantization for Unsupervised Action Segmentation + + +
+ In this work, we address unsupervised temporal action segmentation, which +segments a set of long, untrimmed videos into semantically meaningful segments +that are consistent across videos. While recent approaches combine +representation learning and clustering in a single step for this task, they do +not cope with large variations within temporal segments of the same class. To +address this limitation, we propose a novel method, termed Hierarchical Vector +Quantization (HVQ), that consists of two subsequent vector quantization +modules. This results in a hierarchical clustering where the additional +subclusters cover the variations within a cluster. We demonstrate that our +approach captures the distribution of segment lengths much better than the +state of the art. To this end, we introduce a new metric based on the +Jensen-Shannon Distance (JSD) for unsupervised temporal action segmentation. We +evaluate our approach on three public datasets, namely Breakfast, YouTube +Instructional and IKEA ASM. Our approach outperforms the state of the art in +terms of F1 score, recall and JSD. + +
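Two pieces of the approach are easy to make concrete: a second vector-quantization stage that assigns sub-cluster codes within each top-level cluster, and the Jensen-Shannon distance between predicted and ground-truth segment-length distributions. The sketch below uses random codebooks and toy histograms; it is a schematic of the two ideas, not the trained HVQ model.

```python
import numpy as np
from scipy.spatial.distance import jensenshannon

rng = np.random.default_rng(0)
frames = rng.normal(size=(500, 64))                      # per-frame features

def vq_assign(x, codebook):
    """Nearest-codeword assignment (one VQ module)."""
    d = ((x[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
    return d.argmin(axis=1)

coarse_cb = rng.normal(size=(8, 64))                     # top-level clusters (actions)
coarse = vq_assign(frames, coarse_cb)

fine = np.zeros_like(coarse)
for c in range(len(coarse_cb)):                          # second VQ stage per cluster
    idx = np.where(coarse == c)[0]
    if len(idx):
        sub_cb = rng.normal(size=(3, 64))                # sub-clusters cover within-class variation
        fine[idx] = vq_assign(frames[idx], sub_cb)

# JSD-style metric: compare segment-length histograms on common bins (toy lengths).
pred_hist = np.histogram([5, 7, 20, 22, 40], bins=5, range=(0, 50))[0]
gt_hist = np.histogram([6, 8, 18, 25, 45], bins=5, range=(0, 50))[0]
print("JS distance:", jensenshannon(pred_hist, gt_hist))
```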
+
+ comment: To be published in Conference on Artificial Intelligence (AAAI) 2025 +
+
+
+
+
+ + ♻ ☆ From One to the Power of Many: Invariance to Multi-LiDAR Perception from + Single-Sensor Datasets + + +
+ Recently, LiDAR segmentation methods for autonomous vehicles, powered by deep +neural networks, have experienced steep growth in performance on classic +benchmarks, such as nuScenes and SemanticKITTI. However, there are still large +gaps in performance when deploying models trained on such single-sensor setups +to modern vehicles with multiple high-resolution LiDAR sensors. In this work, +we introduce a new metric for feature-level invariance which can serve as a +proxy to measure cross-domain generalization without requiring labeled data. +Additionally, we propose two application-specific data augmentations, which +facilitate better transfer to multi-sensor LiDAR setups, when trained on +single-sensor datasets. We provide experimental evidence on both simulated and +real data, that our proposed augmentations improve invariance across LiDAR +setups, leading to improved generalization. + +
+
+ comment: Accepted for publication at the ML4AD Workshop @ AAAI Conference 2025 +
+
+
+
+
+ + ♻ ☆ Token Turing Machines are Efficient Vision Models + + +
+ We propose Vision Token Turing Machines (ViTTM), an efficient, low-latency, +memory-augmented Vision Transformer (ViT). Our approach builds on Neural Turing +Machines and Token Turing Machines, which were applied to NLP and sequential +visual understanding tasks. ViTTMs are designed for non-sequential computer +vision tasks such as image classification and segmentation. Our model creates +two sets of tokens: process tokens and memory tokens; process tokens pass +through encoder blocks and read from and write to memory tokens at each encoder block +in the network, allowing them to store and retrieve information from memory. By +ensuring that there are fewer process tokens than memory tokens, we are able to +reduce the inference time of the network while maintaining its accuracy. On +ImageNet-1K, the state-of-the-art ViT-B has a median latency of 529.5 ms and 81.0% +accuracy, while our ViTTM-B is 56% faster (234.1 ms), with 2.4 times fewer +FLOPs, with an accuracy of 82.9%. On ADE20K semantic segmentation, ViT-B +achieves 45.65 mIoU at 13.8 frames per second (FPS) whereas our ViTTM-B model +achieves 45.17 mIoU at 26.8 FPS (+94%). + 
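The process/memory token interaction can be thought of as cross-attention in both directions at each block: process tokens read from memory, are encoded, and then write back to memory. Below is a minimal PyTorch sketch of one such read-write block; the dimensions and the use of nn.MultiheadAttention are illustrative assumptions, not the ViTTM implementation.

```python
import torch
import torch.nn as nn

class ReadWriteBlock(nn.Module):
    """Process tokens read from memory, get encoded, then write back to memory."""
    def __init__(self, dim=192, heads=3):
        super().__init__()
        self.read = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.write = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.encoder = nn.TransformerEncoderLayer(dim, heads, dim * 4, batch_first=True)

    def forward(self, process, memory):
        process = process + self.read(process, memory, memory)[0]   # read from memory
        process = self.encoder(process)                             # cheap: few process tokens
        memory = memory + self.write(memory, process, process)[0]   # write back to memory
        return process, memory

blk = ReadWriteBlock()
process = torch.randn(2, 16, 192)     # few process tokens keep per-block cost low
memory = torch.randn(2, 196, 192)     # many memory tokens store information
p, m = blk(process, memory)
print(p.shape, m.shape)               # torch.Size([2, 16, 192]) torch.Size([2, 196, 192])
```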
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ ViPCap: Retrieval Text-Based Visual Prompts for Lightweight Image + Captioning + + +
+ Recent lightweight image captioning models using retrieved data mainly focus +on text prompts. However, previous works only utilize the retrieved text as +text prompts, and the visual information relies only on the CLIP visual +embedding. Because of this issue, there is a limitation that the image +descriptions inherent in the prompt are not sufficiently reflected in the +visual embedding space. To tackle this issue, we propose ViPCap, a novel +retrieval text-based visual prompt for lightweight image captioning. ViPCap +leverages the retrieved text with image information as visual prompts to +enhance the ability of the model to capture relevant visual information. By +mapping text prompts into the CLIP space and generating multiple randomized +Gaussian distributions, our method leverages sampling to explore randomly +augmented distributions and effectively retrieves the semantic features that +contain image information. These retrieved features are integrated into the +image and designated as the visual prompt, leading to performance improvements +on the datasets such as COCO, Flickr30k, and NoCaps. Experimental results +demonstrate that ViPCap significantly outperforms prior lightweight captioning +models in efficiency and effectiveness, demonstrating the potential for a +plug-and-play solution. The source code is available at +https://github.com/taewhankim/VIPCAP. + +
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ An Interpretable X-ray Style Transfer via Trainable Local Laplacian + Filter + + +
+ Radiologists have preferred visual impressions or 'styles' of X-ray images +that are manually adjusted to their needs to support their diagnostic +performance. In this work, we propose an automatic and interpretable X-ray +style transfer by introducing a trainable version of the Local Laplacian Filter +(LLF). From the shape of the LLF's optimized remap function, the +characteristics of the style transfer can be inferred and reliability of the +algorithm can be ensured. Moreover, we enable the LLF to capture complex X-ray +style features by replacing the remap function with a Multi-Layer Perceptron +(MLP) and adding a trainable normalization layer. We demonstrate the +effectiveness of the proposed method by transforming unprocessed mammographic +X-ray images into images that match the style of target mammograms and achieve +a Structural Similarity Index (SSIM) of 0.94 compared to 0.82 of the baseline +LLF style transfer method from Aubry et al. + +
+
+
+
+
+ + ♻ ☆ Bridging the Visual Gap: Fine-Tuning Multimodal Models with + Knowledge-Adapted Captions + + +
+ Recent research increasingly focuses on training vision-language models +(VLMs) with long, detailed image captions. However, small-scale VLMs often +struggle to balance the richness of these captions with the risk of +hallucinating content during fine-tuning. In this paper, we explore how well +VLMs adapt to such captions. To quantify caption quality, we propose Decomposed +NLI (DNLI), an evaluation framework that breaks down generated captions into +individual propositions, assessing each in isolation. This fine-grained +analysis reveals a critical balance between capturing descriptive details and +preventing hallucinations. Our findings show that simply reducing caption +complexity or employing standard data curation techniques does not effectively +resolve this issue. To tackle this challenge, we introduce Knowledge Adapted +(KnowAda) fine-tuning, a data-centric approach that automatically adapts +training data with the model's existing knowledge and visual understanding. +KnowAda minimizes hallucinations while preserving high descriptiveness. We +validate this approach across several small-scale VLMs (up to 7B parameters) +and dense caption datasets, demonstrating that KnowAda effectively balances +hallucination reduction and descriptiveness. Our results show that KnowAda +outperforms various baselines in both automatic metrics and human evaluations. +We will release our code and models. + +
+
+ comment: Accepted to NAACL 2025 +
+
+
+
+
+ + ♻ ☆ Dysca: A Dynamic and Scalable Benchmark for Evaluating Perception + Ability of LVLMs ICLR2025 + + +
+ Currently, many benchmarks have been proposed to evaluate the perception +ability of Large Vision-Language Models (LVLMs). However, most benchmarks +construct questions by selecting images from existing datasets, resulting in +potential data leakage. Besides, these benchmarks merely focus on evaluating +LVLMs on realistic-style images and clean scenarios, leaving +multi-stylized images and noisy scenarios unexplored. In response to these +challenges, we propose a dynamic and scalable benchmark named Dysca for +evaluating LVLMs by leveraging synthesized images. Specifically, we leverage +Stable Diffusion and design a rule-based method to dynamically generate novel +images, questions and the corresponding answers. We consider 51 kinds of image +styles and evaluate the perception capability in 20 subtasks. Moreover, we +conduct evaluations under 4 scenarios (i.e., Clean, Corruption, Print Attacking +and Adversarial Attacking) and 3 question types (i.e., Multiple-choice, +True-or-false and Free-form). Thanks to the generative paradigm, Dysca serves +as a scalable benchmark for easily adding new subtasks and scenarios. A total +of 24 advanced open-source LVLMs and 2 closed-source LVLMs are evaluated on +Dysca, revealing the drawbacks of current LVLMs. The benchmark is released at +\url{https://github.com/Robin-WZQ/Dysca}. + 
+
+ comment: Accepted by ICLR2025 +
+
+
+
+
+ + ♻ ☆ Heuristic-Free Multi-Teacher Learning + + +
+ We introduce Teacher2Task, a novel framework for multi-teacher learning that +eliminates the need for manual aggregation heuristics. Existing multi-teacher +methods typically rely on such heuristics to combine predictions from multiple +teachers, often resulting in sub-optimal aggregated labels and the propagation +of aggregation errors. Teacher2Task addresses these limitations by introducing +teacher-specific input tokens and reformulating the training process. Instead +of relying on aggregated labels, the framework transforms the training data, +consisting of ground truth labels and annotations from N teachers, into N+1 +distinct tasks: N auxiliary tasks that predict the labeling styles of the N +individual teachers, and one primary task that focuses on the ground truth +labels. This approach, drawing upon principles from multiple learning +paradigms, demonstrates strong empirical results across a range of +architectures, modalities, and tasks. + +
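The reformulation itself is mostly a data transformation: every annotated example is duplicated once per teacher with a teacher-specific prefix token, plus one copy for the ground-truth task. The sketch below shows that expansion for a text-style input; the token names and the record layout are hypothetical, not the paper's exact format.

```python
def teacher2task_records(example, teacher_annotations, ground_truth):
    """Expand one example into N teacher-style tasks + 1 ground-truth task."""
    records = []
    for teacher_name, label in teacher_annotations.items():
        records.append({
            "input": f"<{teacher_name}> {example}",   # teacher-specific input token
            "target": label,                          # predict that teacher's labeling style
        })
    records.append({
        "input": f"<ground_truth> {example}",         # the primary task
        "target": ground_truth,
    })
    return records

example = "a photo of a dog on a skateboard"
annotations = {"teacher_A": "dog", "teacher_B": "skateboard"}  # hypothetical teachers
for r in teacher2task_records(example, annotations, "dog"):
    print(r)
```

Because no aggregation heuristic is applied, disagreement between teachers simply becomes different auxiliary targets rather than a noisy merged label.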
+
+
+
+
+ + ♻ ☆ REP: Resource-Efficient Prompting for Rehearsal-Free Continual Learning + + +
+ Recent rehearsal-free methods, guided by prompts, generally excel in +vision-related continual learning (CL) scenarios with continuously drifting +data. To be deployable on real-world devices, these methods must maintain high +resource efficiency during training. In this paper, we introduce +Resource-Efficient Prompting (REP), which aims to improve the resource +efficiency of prompt-based rehearsal-free methods. Our key focus is on avoiding +catastrophic trade-offs with accuracy while trimming computational and memory +costs during prompt learning. We achieve this by exploiting swift prompt +selection that enhances input data using a carefully provisioned model, and by +developing adaptive token merging (AToM) and layer dropping (ALD) algorithms +for the prompt updating stage. AToM and ALD perform selective skipping across +the data and model dimensions without compromising task-specific features while +learning new tasks. We validate REP's superior resource efficiency over current +state-of-the-art ViT- and CNN-based methods through extensive experiments on +three image classification datasets. + 
+
+
+
+
+ + ♻ ☆ PhyDeformer: High-Quality Non-Rigid Garment Registration with + Physics-Awareness + + +
+ We present PhyDeformer, a new deformation method for high-quality garment +mesh registration. It operates in two phases: In the first phase, a garment +grading is performed to achieve a coarse 3D alignment between the mesh template +and the target mesh, accounting for proportional scaling and fit (e.g. length, +size). Then, the graded mesh is refined to align with the fine-grained details +of the 3D target through an optimization coupled with the Jacobian-based +deformation framework. Both quantitative and qualitative evaluations on +synthetic and real garments highlight the effectiveness of our method. + +
+
+
+
+
+ + ♻ ☆ Deep Learning Based Segmentation of Blood Vessels from H&E Stained + Oesophageal Adenocarcinoma Whole-Slide Images + + +
+ Blood vessels (BVs) play a critical role in the Tumor Micro-Environment +(TME), potentially influencing cancer progression and treatment response. +However, manually quantifying BVs in Hematoxylin and Eosin (H&E) stained images +is challenging and labor-intensive due to their heterogeneous appearances. We +propose a novel approach of constructing guiding maps to improve the +performance of state-of-the-art segmentation models for BV segmentation; the +guiding maps encourage the models to learn representative features of BVs. This +is particularly beneficial for computational pathology, where labeled training +data is often limited and large models are prone to overfitting. We present +quantitative and qualitative results that demonstrate the efficacy of our +approach in improving segmentation accuracy. In future work, we plan to validate +this method for segmenting BVs across various tissue types and to investigate the role +of cellular structures in relation to BVs in the TME. + 
+
+ comment: Accepted by ISBI 2025 +
+
+
+
+
+ + ♻ ☆ An Adaptive Cost-Sensitive Learning and Recursive Denoising Framework + for Imbalanced SVM Classification + + +
+ Category imbalance is one of the most common and important issues in the +domain of classification. Emotion classification models trained on imbalanced +datasets easily lead to unreliable predictions. Traditional machine +learning methods tend to favor the majority class, which leads to a lack of +minority class information in the model. Moreover, most existing models +exhibit abnormal sensitivity or performance degradation. We propose a +robust learning algorithm based on adaptive cost-sensitivity and recursive +denoising, which is a generalized framework and can be incorporated into most +stochastic optimization algorithms. The proposed method uses a dynamic kernel +distance optimization model between the sample and the decision boundary, which +makes full use of the sample's prior information. In addition, we also put +forward an effective noise filtering method, whose main idea is to identify +noise by examining the nearest neighbors of minority class samples. In order to +evaluate the strength of the proposed method, we not only carry out experiments +on standard datasets but also apply it to emotion classification problems +with different imbalance rates (IR). Experimental results show that the +proposed general framework is superior to traditional methods in Accuracy, +G-mean, Recall and F1-score. + 
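The noise-filtering idea, judging a minority-class sample by its nearest neighbors, can be sketched with scikit-learn: flag minority samples whose neighborhood is dominated by the majority class. The neighborhood size and voting threshold below are illustrative assumptions, not the paper's settings.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
X_maj = rng.normal(0.0, 1.0, size=(200, 2))        # majority class samples
X_min = rng.normal(2.0, 1.0, size=(20, 2))         # minority class samples
X = np.vstack([X_maj, X_min])
y = np.array([0] * 200 + [1] * 20)

nn_index = NearestNeighbors(n_neighbors=6).fit(X)
_, neigh = nn_index.kneighbors(X[y == 1])          # neighbors of each minority sample
neigh = neigh[:, 1:]                               # drop the sample itself

# A minority sample is treated as noise if most of its neighbors are majority class.
majority_ratio = (y[neigh] == 0).mean(axis=1)
is_noise = majority_ratio > 0.8                    # hypothetical threshold
print("flagged", int(is_noise.sum()), "of", len(is_noise), "minority samples as noise")
```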
+
+ comment: 22 pages, 41 figures +
+
+
+
+
+ + ♻ ☆ One-Prompt-One-Story: Free-Lunch Consistent Text-to-Image Generation + Using a Single Prompt ICLR2025 + + +
+ Text-to-image generation models can create high-quality images from input +prompts. However, they struggle to support the consistent generation of +identity-preserving requirements for storytelling. Existing approaches to this +problem typically require extensive training in large datasets or additional +modifications to the original model architectures. This limits their +applicability across different domains and diverse diffusion model +configurations. In this paper, we first observe the inherent capability of +language models, coined context consistency, to comprehend identity through +context with a single prompt. Drawing inspiration from the inherent context +consistency, we propose a novel training-free method for consistent +text-to-image (T2I) generation, termed "One-Prompt-One-Story" (1Prompt1Story). +Our approach 1Prompt1Story concatenates all prompts into a single input for T2I +diffusion models, initially preserving character identities. We then refine the +generation process using two novel techniques: Singular-Value Reweighting and +Identity-Preserving Cross-Attention, ensuring better alignment with the input +description for each frame. In our experiments, we compare our method against +various existing consistent T2I generation approaches to demonstrate its +effectiveness through quantitative metrics and qualitative assessments. Code is +available at https://github.com/byliutao/1Prompt1Story. + +
+
+ comment: 28 pages, 22 figures, ICLR2025 conference +
+
+
+
+
+ + ♻ ☆ A New Cross-Space Total Variation Regularization Model for Color Image + Restoration with Quaternion Blur Operator + + +
+ The cross-channel deblurring problem in color image processing is difficult +to solve due to the complex coupling and structural blurring of color pixels. +To date, few efficient algorithms can reduce color artifacts +in the deblurring process. To solve this challenging problem, we present a novel +cross-space total variation (CSTV) regularization model for color image +deblurring by introducing a quaternion blur operator and a cross-color space +regularization functional. The existence and uniqueness of the solution are +proved, and a new L-curve method is proposed to find a balance of regularization +terms on different color spaces. The Euler-Lagrange equation is derived to show +that CSTV takes into account the coupling of all color channels and the +local smoothing within each color channel. A quaternion operator splitting +method is first proposed to enhance the color artifact reduction ability +of the CSTV regularization model. This strategy also applies to the well-known +color deblurring models. Numerical experiments on color image databases +illustrate the efficiency and effectiveness of the new model and algorithms. +The color images restored by them successfully maintain the color and spatial +information and are of higher quality in terms of PSNR, SSIM, MSE and CIEde2000 +than the restorations of state-of-the-art methods. + 
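Schematically, a cross-space TV model of this kind minimizes a data-fidelity term under a quaternion blur operator plus TV penalties evaluated in more than one color space; the form below is a generic illustration of that structure, not the paper's exact functional, and the weights and transform are placeholders.

```latex
\min_{\mathbf{u}}\;
\frac{1}{2}\,\bigl\lVert \mathcal{A}\mathbf{u} - \mathbf{f} \bigr\rVert_2^2
\;+\; \lambda_1\,\mathrm{TV}(\mathbf{u})
\;+\; \lambda_2\,\mathrm{TV}\!\bigl(T(\mathbf{u})\bigr)
```

Here A stands for the quaternion blur operator, f for the observed blurry color image, T for a transform to a second color space, and lambda_1, lambda_2 for the regularization weights that the L-curve method mentioned in the abstract is meant to balance.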
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ S3PT: Scene Semantics and Structure Guided Clustering to Boost + Self-Supervised Pre-Training for Autonomous Driving + + +
+ Recent self-supervised clustering-based pre-training techniques like DINO and +Cribo have shown impressive results for downstream detection and segmentation +tasks. However, real-world applications such as autonomous driving face +challenges with imbalanced object class and size distributions and complex +scene geometries. In this paper, we propose S3PT a novel scene semantics and +structure guided clustering to provide more scene-consistent objectives for +self-supervised training. Specifically, our contributions are threefold: First, +we incorporate semantic distribution consistent clustering to encourage better +representation of rare classes such as motorcycles or animals. Second, we +introduce object diversity consistent spatial clustering, to handle imbalanced +and diverse object sizes, ranging from large background areas to small objects +such as pedestrians and traffic signs. Third, we propose a depth-guided spatial +clustering to regularize learning based on geometric information of the scene, +thus further refining region separation on the feature level. Our learned +representations significantly improve performance in downstream semantic +segmentation and 3D object detection tasks on the nuScenes, nuImages, and +Cityscapes datasets and show promising domain translation properties. + +
+
+ comment: Accepted for WACV 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ ASCNet: Asymmetric Sampling Correction Network for Infrared Image + Destriping + + +
+ In a real-world infrared imaging system, effectively learning a consistent +stripe noise removal model is essential. Most existing destriping methods +cannot precisely reconstruct images due to cross-level semantic gaps and +insufficient characterization of the global column features. To tackle this +problem, we propose a novel infrared image destriping method, called Asymmetric +Sampling Correction Network (ASCNet), that can effectively capture global +column relationships and embed them into a U-shaped framework, providing +comprehensive discriminative representation and seamless semantic connectivity. +Our ASCNet consists of three core elements: Residual Haar Discrete Wavelet +Transform (RHDWT), Pixel Shuffle (PS), and Column Non-uniformity Correction +Module (CNCM). Specifically, RHDWT is a novel downsampler that employs +double-branch modeling to effectively integrate stripe-directional prior +knowledge and data-driven semantic interaction to enrich the feature +representation. Observing the crosstalk between semantic patterns and stripe noise, PS +is introduced as an upsampler to prevent excessive a priori decoding and +perform semantic-bias-free image reconstruction. After each sampling operation, CNCM +captures long-range column relationships. By incorporating +column, spatial, and self-dependence information, CNCM establishes a +global context to distinguish stripes from the scene's vertical structures. +Extensive experiments on synthetic data, real data, and infrared small target +detection tasks demonstrate that the proposed method outperforms +state-of-the-art single-image destriping methods both visually and +quantitatively. Our code will be made publicly available at +https://github.com/xdFai/ASCNet. + 
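The two sampling operators are standard building blocks: a Haar discrete wavelet transform splits a feature map into low/high-frequency sub-bands at half resolution (a downsampler), and pixel shuffle rearranges channels into space (an upsampler). The PyTorch sketch below shows both in isolation with hand-written Haar filters; it illustrates the operators only, not ASCNet's RHDWT or PS modules.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def haar_dwt(x):
    """Single-level Haar DWT: stacks the LL, LH, HL, HH sub-bands at half resolution."""
    ll = torch.tensor([[0.5, 0.5], [0.5, 0.5]])
    lh = torch.tensor([[0.5, 0.5], [-0.5, -0.5]])
    hl = torch.tensor([[0.5, -0.5], [0.5, -0.5]])
    hh = torch.tensor([[0.5, -0.5], [-0.5, 0.5]])
    kernels = torch.stack([ll, lh, hl, hh]).unsqueeze(1)   # (4, 1, 2, 2)
    c = x.shape[1]
    k = kernels.repeat(c, 1, 1, 1)                         # depthwise: 4 bands per channel
    return F.conv2d(x, k, stride=2, groups=c)              # (B, 4*C, H/2, W/2)

x = torch.randn(1, 16, 32, 32)
bands = haar_dwt(x)                     # downsampling: 16 -> 64 channels, 32 -> 16 spatial
up = nn.PixelShuffle(2)(bands)          # upsampling: 64 -> 16 channels, 16 -> 32 spatial
print(bands.shape, up.shape)            # (1, 64, 16, 16) (1, 16, 32, 32)
```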
+
+
+
+
+ + ♻ ☆ HeightLane: BEV Heightmap guided 3D Lane Detection + + +
+ Accurate 3D lane detection from monocular images presents significant +challenges due to depth ambiguity and imperfect ground modeling. Previous +attempts to model the ground have often used a planar ground assumption with +limited degrees of freedom, making them unsuitable for complex road +environments with varying slopes. Our study introduces HeightLane, an +innovative method that predicts a height map from monocular images by creating +anchors based on a multi-slope assumption. This approach provides a detailed +and accurate representation of the ground. HeightLane employs the predicted +heightmap along with a deformable attention-based spatial feature transform +framework to efficiently convert 2D image features into 3D bird's eye view +(BEV) features, enhancing spatial understanding and lane structure recognition. +Additionally, the heightmap is used for the positional encoding of BEV +features, further improving their spatial accuracy. This explicit view +transformation bridges the gap between front-view perceptions and spatially +accurate BEV representations, significantly improving detection performance. To +address the lack of the necessary ground truth (GT) height map in the original +OpenLane dataset, we leverage the Waymo dataset and accumulate its LiDAR data +to generate a height map for the drivable area of each scene. The GT heightmaps +are used to train the heightmap extraction module from monocular images. +Extensive experiments on the OpenLane validation set show that HeightLane +achieves state-of-the-art performance in terms of F-score, highlighting its +potential in real-world applications. + +
+
+ comment: 10 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Exposure Bracketing Is All You Need For A High-Quality Image ICLR 2025 + + +
+ It is highly desired but challenging to acquire high-quality photos with +clear content in low-light environments. Although multi-image processing +methods (using burst, dual-exposure, or multi-exposure images) have made +significant progress in addressing this issue, they typically focus on specific +restoration or enhancement problems, and do not fully explore the potential of +utilizing multiple images. Motivated by the fact that multi-exposure images are +complementary in denoising, deblurring, high dynamic range imaging, and +super-resolution, we propose to utilize exposure bracketing photography to get +a high-quality image by combining these tasks in this work. Due to the +difficulty in collecting real-world pairs, we suggest a solution that first +pre-trains the model with synthetic paired data and then adapts it to +real-world unlabeled images. In particular, a temporally modulated recurrent +network (TMRNet) and self-supervised adaptation method are proposed. Moreover, +we construct a data simulation pipeline to synthesize pairs and collect +real-world images from 200 nighttime scenarios. Experiments on both datasets +show that our method performs favorably against the state-of-the-art +multi-image processing ones. Code and datasets are available at +https://github.com/cszhilu1998/BracketIRE. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Teacher Encoder-Student Decoder Denoising Guided Segmentation Network + for Anomaly Detection + + +
+ Visual anomaly detection is a highly challenging task, often categorized as a +one-class classification and segmentation problem. Recent studies have +demonstrated that the student-teacher (S-T) framework effectively addresses +this challenge. However, most S-T frameworks rely solely on pre-trained teacher +networks to guide student networks in learning multi-scale similar features, +overlooking the potential of the student networks to enhance learning through +multi-scale feature fusion. In this study, we propose a novel model named +PFADSeg, which integrates a pre-trained teacher network, a denoising student +network with multi-scale feature fusion, and a guided anomaly segmentation +network into a unified framework. By adopting a unique teacher-encoder and +student-decoder denoising mode, the model improves the student network's +ability to learn from teacher network features. Furthermore, an adaptive +feature fusion mechanism is introduced to train a self-supervised segmentation +network that synthesizes anomaly masks autonomously, significantly increasing +detection performance. Evaluated on the MVTec AD dataset, PFADSeg achieves +state-of-the-art results with an image-level AUC of 98.9%, a pixel-level mean +precision of 76.4%, and an instance-level mean precision of 78.7%. + +
+
+
+
+
+ + ♻ ☆ Multi-aspect Knowledge Distillation with Large Language Model + + +
+ Recent advancements in deep learning have significantly improved performance +on computer vision tasks. Previous image classification methods primarily +modify model architectures or add features, and they optimize models using +cross-entropy loss on class logits. Since they focus on classifying images with +considering class labels, these methods may struggle to learn various +\emph{aspects} of classes (e.g., natural positions and shape changes). +Rethinking the previous approach from a novel view, we propose a multi-aspect +knowledge distillation method using Multimodal Large Language Models (MLLMs). +Our approach involves: 1) querying Large Language Model with multi-aspect +questions relevant to the knowledge we want to transfer to the model, 2) +extracting corresponding logits from MLLM, and 3) expanding the model's output +dimensions to distill these multi-aspect logits. We then apply cross-entropy +loss to class logits and binary cross-entropy loss to multi-aspect logits. +Through our method, the model can learn not only the knowledge about visual +aspects but also the abstract and complex aspects that require a deeper +understanding. We primarily apply our method to image classification, and to +explore the potential for extending our model, we expand it to other tasks, +such as object detection. In all experimental results, our method improves the +performance of the baselines. Additionally, we analyze the effect of +multi-aspect knowledge distillation. These results demonstrate that our method +can transfer knowledge about various aspects to the model and the aspect +knowledge can enhance model performance in computer vision tasks. This paper +demonstrates the great potential of multi-aspect knowledge distillation, and we +believe it offers a promising direction for future research in computer vision +and beyond. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ COLT: Cyclic Overlapping Lottery Tickets for Faster Pruning of + Convolutional Neural Networks + + +
+ Pruning refers to the elimination of trivial weights from neural networks. +The sub-networks within an overparameterized model produced after pruning are +often called Lottery tickets. This research aims to generate winning lottery +tickets from a set of lottery tickets that can achieve similar accuracy to the +original unpruned network. We introduce a novel winning ticket called Cyclic +Overlapping Lottery Ticket (COLT) by data splitting and cyclic retraining of +the pruned network from scratch. We apply a cyclic pruning algorithm that keeps +only the overlapping weights of different pruned models trained on different +data segments. Our results demonstrate that COLT can achieve similar accuracies +(obtained by the unpruned model) while maintaining high sparsities. We show +that the accuracy of COLT is on par with the winning tickets of Lottery Ticket +Hypothesis (LTH) and, at times, is better. Moreover, COLTs can be generated +using fewer iterations than tickets generated by the popular Iterative +Magnitude Pruning (IMP) method. In addition, we also notice COLTs generated on +large datasets can be transferred to small ones without compromising +performance, demonstrating its generalizing capability. We conduct all our +experiments on Cifar-10, Cifar-100 & TinyImageNet datasets and report superior +performance than the state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ ComPC: Completing a 3D Point Cloud with 2D Diffusion Priors ICLR 2025 + + +
+ 3D point clouds directly collected from objects through sensors are often +incomplete due to self-occlusion. Conventional methods for completing these +partial point clouds rely on manually organized training sets and are usually +limited to object categories seen during training. In this work, we propose a +test-time framework for completing partial point clouds across unseen +categories without any requirement for training. Leveraging point rendering via +Gaussian Splatting, we develop techniques of Partial Gaussian Initialization, +Zero-shot Fractal Completion, and Point Cloud Extraction that utilize priors +from pre-trained 2D diffusion models to infer missing regions and extract +uniform completed point clouds. Experimental results on both synthetic and +real-world scanned point clouds demonstrate that our approach outperforms +existing methods in completing a variety of objects. Our project page is at +\url{https://tianxinhuang.github.io/projects/ComPC/}. + +
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Enhanced Encoder-Decoder Architecture for Accurate Monocular Depth + Estimation + + +
+ Estimating depth from a single 2D image is a challenging task due to the lack +of stereo or multi-view data, which are typically required for depth +perception. In state-of-the-art architectures, the main challenge is to +efficiently capture complex objects and fine-grained details, which are often +difficult to predict. This paper introduces a novel deep learning-based +approach using an enhanced encoder-decoder architecture, where the +Inception-ResNet-v2 model serves as the encoder. This is the first instance of +utilizing Inception-ResNet-v2 as an encoder for monocular depth estimation, +demonstrating improved performance over previous models. It incorporates +multi-scale feature extraction to enhance depth prediction accuracy across +various object sizes and distances. We propose a composite loss function +comprising depth loss, gradient edge loss, and Structural Similarity Index +Measure (SSIM) loss, with fine-tuned weights to optimize the weighted sum, +ensuring a balance across different aspects of depth estimation. Experimental +results on the KITTI dataset show that our model achieves a significantly +faster inference time of 0.019 seconds, outperforming vision transformers in +efficiency while maintaining good accuracy. On the NYU Depth V2 dataset, the +model establishes state-of-the-art performance, with an Absolute Relative Error +(ARE) of 0.064, a Root Mean Square Error (RMSE) of 0.228, and an accuracy of +89.3% for $\delta$ < 1.25. These metrics demonstrate that our model can +accurately and efficiently predict depth even in challenging scenarios, +providing a practical solution for real-time applications. + +
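The composite objective combines a pixelwise depth term, a gradient (edge) term, and an SSIM term with tuned weights. The sketch below composes these pieces with a simplified average-pooling SSIM; the weights, the L1 depth term, and the 3x3 SSIM window are illustrative assumptions, not the paper's tuned configuration.

```python
import torch
import torch.nn.functional as F

def ssim(a, b, c1=0.01 ** 2, c2=0.03 ** 2):
    """Simplified SSIM over 3x3 neighborhoods (values assumed in [0, 1])."""
    mu_a, mu_b = F.avg_pool2d(a, 3, 1), F.avg_pool2d(b, 3, 1)
    var_a = F.avg_pool2d(a * a, 3, 1) - mu_a ** 2
    var_b = F.avg_pool2d(b * b, 3, 1) - mu_b ** 2
    cov = F.avg_pool2d(a * b, 3, 1) - mu_a * mu_b
    s = ((2 * mu_a * mu_b + c1) * (2 * cov + c2)) / ((mu_a ** 2 + mu_b ** 2 + c1) * (var_a + var_b + c2))
    return s.clamp(0, 1).mean()

def composite_depth_loss(pred, gt, w_depth=0.1, w_grad=1.0, w_ssim=1.0):
    """Weighted sum of depth (L1), gradient-edge, and SSIM terms (weights are illustrative)."""
    l_depth = (pred - gt).abs().mean()
    dx_p, dy_p = pred[..., :, 1:] - pred[..., :, :-1], pred[..., 1:, :] - pred[..., :-1, :]
    dx_g, dy_g = gt[..., :, 1:] - gt[..., :, :-1], gt[..., 1:, :] - gt[..., :-1, :]
    l_grad = (dx_p - dx_g).abs().mean() + (dy_p - dy_g).abs().mean()
    l_ssim = (1 - ssim(pred, gt)) / 2
    return w_depth * l_depth + w_grad * l_grad + w_ssim * l_ssim

pred, gt = torch.rand(2, 1, 64, 64), torch.rand(2, 1, 64, 64)
print(composite_depth_loss(pred, gt).item())
```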
+
+
+
+
+ + ♻ ☆ ControlAR: Controllable Image Generation with Autoregressive Models ICLR 2025 + + +
+ Autoregressive (AR) models have reformulated image generation as next-token +prediction, demonstrating remarkable potential and emerging as strong +competitors to diffusion models. However, control-to-image generation, akin to +ControlNet, remains largely unexplored within AR models. Although a natural +approach, inspired by advancements in Large Language Models, is to tokenize +control images into tokens and prefill them into the autoregressive model +before decoding image tokens, it still falls short in generation quality +compared to ControlNet and suffers from inefficiency. To this end, we introduce +ControlAR, an efficient and effective framework for integrating spatial +controls into autoregressive image generation models. Firstly, we explore +control encoding for AR models and propose a lightweight control encoder to +transform spatial inputs (e.g., canny edges or depth maps) into control tokens. +Then ControlAR exploits the conditional decoding method to generate the next +image token conditioned on the per-token fusion between control and image +tokens, similar to positional encodings. Compared to prefilling tokens, using +conditional decoding significantly strengthens the control capability of AR +models but also maintains the model's efficiency. Furthermore, the proposed +ControlAR surprisingly empowers AR models with arbitrary-resolution image +generation via conditional decoding and specific controls. Extensive +experiments can demonstrate the controllability of the proposed ControlAR for +the autoregressive control-to-image generation across diverse inputs, including +edges, depths, and segmentation masks. Furthermore, both quantitative and +qualitative results indicate that ControlAR surpasses previous state-of-the-art +controllable diffusion models, e.g., ControlNet++. Code, models, and demo will +soon be available at https://github.com/hustvl/ControlAR. + +
+
+ comment: To appear in ICLR 2025. Work in progress +
+
+
+
+
+ + ♻ ☆ In-Situ Fine-Tuning of Wildlife Models in IoT-Enabled Camera Traps for + Efficient Adaptation + + +
+ Resource-constrained IoT devices increasingly rely on deep learning models +for inference tasks in remote environments. However, these models experience +significant accuracy drops due to domain shifts when encountering variations in +lighting, weather, and seasonal conditions. While cloud-based retraining can +address this issue, many IoT deployments operate with limited connectivity and +energy constraints, making traditional fine-tuning approaches impractical. We +explore this challenge through the lens of wildlife ecology, where camera traps +must maintain accurate species classification across changing seasons, weather, +and habitats without reliable connectivity. We introduce WildFit, an autonomous +in-situ adaptation framework that leverages the key insight that background +scenes change more frequently than the visual characteristics of monitored +species. WildFit combines background-aware synthesis to generate training +samples on-device with drift-aware fine-tuning that triggers model updates only +when necessary to conserve resources. Through extensive evaluation on multiple +camera trap deployments, we demonstrate that WildFit significantly improves +accuracy while greatly reducing adaptation overhead compared to traditional +approaches. + +
+
+
+
+
+ + ♻ ☆ Tarsier2: Advancing Large Vision-Language Models from Detailed Video + Description to Comprehensive Video Understanding + + +
+ We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM) +designed for generating detailed and accurate video descriptions, while also +exhibiting superior general video understanding capabilities. Tarsier2 achieves +significant advancements through three key upgrades: (1) Scaling pre-training +data from 11M to 40M video-text pairs, enriching both volume and diversity; (2) +Performing fine-grained temporal alignment during supervised fine-tuning; (3) +Using model-based sampling to automatically construct preference data and +applying DPO training for optimization. Extensive experiments show that +Tarsier2-7B consistently outperforms leading proprietary models, including +GPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K +benchmark, Tarsier2-7B improves F1 by 2.8% over GPT-4o and 5.8% over +Gemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6% +performance advantage over GPT-4o and +24.9% over Gemini-1.5-Pro. Tarsier2-7B +also sets new state-of-the-art results across 15 public benchmarks, spanning +tasks such as video question-answering, video grounding, hallucination test, +and embodied question-answering, demonstrating its versatility as a robust +generalist vision-language model. + +
+
+
+
+
+ + ♻ ☆ PreMix: Addressing Label Scarcity in Whole Slide Image Classification + with Pre-trained Multiple Instance Learning Aggregators + + +
+ Multiple instance learning (MIL) has emerged as a powerful framework for +weakly supervised whole slide image (WSI) classification, enabling slide-level +predictions without requiring detailed patch-level annotations. However, a key +limitation of MIL lies in the underexplored potential of pre-training the MIL +aggregator. Most existing approaches train it from scratch, resulting in +performance heavily dependent on the number of labeled WSIs, while overlooking +the abundance of unlabeled WSIs available in real-world scenarios. To address +this, we propose PreMix, a novel framework that leverages a non-contrastive +pre-training method, Barlow Twins, augmented with the Slide Mixing approach to +generate additional positive pairs and enhance feature learning, particularly +under limited labeled WSI conditions. Fine-tuning with Mixup and Manifold Mixup +further enhances robustness by effectively handling the diverse sizes of +gigapixel WSIs. Experimental results demonstrate that integrating HIPT into +PreMix achieves an average F1 improvement of 4.7% over the baseline HIPT across +various WSI training datasets and label sizes. These findings underscore its +potential to advance WSI classification with limited labeled data and its +applicability to real-world histopathology practices. The code is available at +https://anonymous.4open.science/r/PreMix + +
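Barlow Twins, the non-contrastive objective that PreMix builds on, pushes the cross-correlation matrix between two embedded views toward the identity. A compact sketch of that loss is shown below; the embedding dimension and lambda are illustrative, and the slide-mixing augmentation itself is not shown.

```python
import torch

def barlow_twins_loss(z1, z2, lam=5e-3):
    """Cross-correlation of two views' embeddings should approach the identity matrix."""
    n, d = z1.shape
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)      # standardize along the batch
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
    c = (z1.T @ z2) / n                              # d x d cross-correlation matrix
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()            # invariance term
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()  # redundancy reduction
    return on_diag + lam * off_diag

z1, z2 = torch.randn(32, 128), torch.randn(32, 128)  # two augmented views of slide-level bags
print(barlow_twins_loss(z1, z2).item())
```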
+
+ comment: Under review for the Biomedical Signal Processing and Control journal +
+
+
+
+
+ + ♻ ☆ Interpretable Face Anti-Spoofing: Enhancing Generalization with + Multimodal Large Language Models + + +
+ Face Anti-Spoofing (FAS) is essential for ensuring the security and +reliability of facial recognition systems. Most existing FAS methods are +formulated as binary classification tasks, providing confidence scores without +interpretation. They exhibit limited generalization in out-of-domain scenarios, +such as new environments or unseen spoofing types. In this work, we introduce a +multimodal large language model (MLLM) framework for FAS, termed Interpretable +Face Anti-Spoofing (I-FAS), which transforms the FAS task into an interpretable +visual question answering (VQA) paradigm. Specifically, we propose a +Spoof-aware Captioning and Filtering (SCF) strategy to generate high-quality +captions for FAS images, enriching the model's supervision with natural +language interpretations. To mitigate the impact of noisy captions during +training, we develop a Lopsided Language Model (L-LM) loss function that +separates loss calculations for judgment and interpretation, prioritizing the +optimization of the former. Furthermore, to enhance the model's perception of +global visual features, we design a Globally Aware Connector (GAC) to align +multi-level visual representations with the language model. Extensive +experiments on standard and newly devised One to Eleven cross-domain +benchmarks, comprising 12 public datasets, demonstrate that our method +significantly outperforms state-of-the-art methods. + +
+
+ comment: Accepted to AAAI2025(Oral) +
+
+
+
+
+ + ♻ ☆ Skip-WaveNet: A Wavelet based Multi-scale Architecture to Trace Snow + Layers in Radar Echograms + + +
+ Airborne radar sensors capture the profile of snow layers present on top of +an ice sheet. Accurate tracking of these layers is essential to calculate their +thicknesses, which are required to investigate the contribution of polar ice +cap melt to sea-level rise. However, automatically processing the radar +echograms to detect the underlying snow layers is a challenging problem. In our +work, we develop wavelet-based multi-scale deep learning architectures for +these radar echograms to improve snow layer detection. These architectures +estimate the layer depths with a mean absolute error of 3.31 pixels and 94.3% +average precision, achieving higher generalizability as compared to +state-of-the-art snow layer detection networks. These depth estimates also +agree well with physically drilled stake measurements. Such robust +architectures can be used on echograms from future missions to efficiently +trace snow layers, estimate their individual thicknesses and thus support +sea-level rise projection models. + +
+
+
+
+
+ + ♻ ☆ LoFi: Vision-Aided Label Generator for Wi-Fi Localization and Tracking + + +
+ Data-driven Wi-Fi localization and tracking have shown great promise due to +their lower reliance on specialized hardware compared to model-based methods. +However, most existing data collection techniques provide only coarse-grained +ground truth or a limited number of labeled points, significantly hindering the +advancement of data-driven approaches. While systems like lidar can deliver +precise ground truth, their high costs make them inaccessible to many users. To +address these challenges, we propose LoFi, a vision-aided label generator for +Wi-Fi localization and tracking. LoFi can generate ground truth position +coordinates solely from 2D images, offering high precision, low cost, and ease +of use. Utilizing our method, we have compiled a Wi-Fi tracking and +localization dataset using the ESP32-S3 and a webcam, which will be +open-sourced along with the code upon publication. + +
+
+
+
+
+ + ♻ ☆ Adaptive Retention & Correction: Test-Time Training for Continual + Learning ICLR 2025 + + +
+ Continual learning, also known as lifelong learning or incremental learning, +refers to the process by which a model learns from a stream of incoming data +over time. A common problem in continual learning is the classification layer's +bias towards the most recent task. Traditionally, methods have relied on +incorporating data from past tasks during training to mitigate this issue. +However, the recent shift in continual learning to memory-free environments has +rendered these approaches infeasible. In this study, we propose a solution +focused on the testing phase. We first introduce a simple Out-of-Task Detection +method, OTD, designed to accurately identify samples from past tasks during +testing. Leveraging OTD, we then propose: (1) an Adaptive Retention mechanism +for dynamically tuning the classifier layer on past task data; (2) an Adaptive +Correction mechanism for revising predictions when the model classifies data +from previous tasks into classes from the current task. We name our approach +Adaptive Retention & Correction (ARC). While designed for memory-free +environments, ARC also proves effective in memory-based settings. Extensive +experiments show that our proposed method can be plugged in to virtually any +existing continual learning approach without requiring any modifications to its +training procedure. Specifically, when integrated with state-of-the-art +approaches, ARC achieves an average performance increase of 2.7% and 2.6% on +the CIFAR-100 and Imagenet-R datasets, respectively. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 31 + +
+
+
+ + ☆ Fast3R: Towards 3D Reconstruction of 1000+ Images in One Forward Pass + + +
+ Multi-view 3D reconstruction remains a core challenge in computer vision, +particularly in applications requiring accurate and scalable representations +across diverse perspectives. Current leading methods such as DUSt3R employ a +fundamentally pairwise approach, processing images in pairs and necessitating +costly global alignment procedures to reconstruct from multiple views. In this +work, we propose Fast 3D Reconstruction (Fast3R), a novel multi-view +generalization to DUSt3R that achieves efficient and scalable 3D reconstruction +by processing many views in parallel. Fast3R's Transformer-based architecture +forwards N images in a single forward pass, bypassing the need for iterative +alignment. Through extensive experiments on camera pose estimation and 3D +reconstruction, Fast3R demonstrates state-of-the-art performance, with +significant improvements in inference speed and reduced error accumulation. +These results establish Fast3R as a robust alternative for multi-view +applications, offering enhanced scalability without compromising reconstruction +accuracy. + +
+
+ comment: Project website: https://fast3r-3d.github.io/ +
+
+
+
+
+ + ☆ Temporal Preference Optimization for Long-Form Video Understanding + + +
+ Despite significant advancements in video large multimodal models +(video-LMMs), achieving effective temporal grounding in long-form videos +remains a challenge for existing models. To address this limitation, we propose +Temporal Preference Optimization (TPO), a novel post-training framework +designed to enhance the temporal grounding capabilities of video-LMMs through +preference learning. TPO adopts a self-training approach that enables models to +differentiate between well-grounded and less accurate temporal responses by +leveraging curated preference datasets at two granularities: localized temporal +grounding, which focuses on specific video segments, and comprehensive temporal +grounding, which captures extended temporal dependencies across entire video +sequences. By optimizing on these preference datasets, TPO significantly +enhances temporal understanding while reducing reliance on manually annotated +data. Extensive experiments on three long-form video understanding +benchmarks--LongVideoBench, MLVU, and Video-MME--demonstrate the effectiveness +of TPO across two state-of-the-art video-LMMs. Notably, LLaVA-Video-TPO +establishes itself as the leading 7B model on the Video-MME benchmark, +underscoring the potential of TPO as a scalable and efficient solution for +advancing temporal reasoning in long-form video understanding. Project page: +https://ruili33.github.io/tpo_website. + +
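The preference-learning step follows the standard DPO formulation: the policy is pushed to prefer the well-grounded temporal response over the less accurate one, relative to a frozen reference model. The sketch below shows the DPO loss on precomputed sequence log-probabilities; it illustrates the objective only, not the TPO data pipeline or the video-LMM itself, and the numbers are dummies.

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Standard DPO objective on summed sequence log-probabilities."""
    chosen_reward = beta * (logp_chosen - ref_logp_chosen)
    rejected_reward = beta * (logp_rejected - ref_logp_rejected)
    return -F.logsigmoid(chosen_reward - rejected_reward).mean()

# Dummy log-probs for a batch of (well-grounded, less-grounded) response pairs.
lp_c, lp_r = torch.tensor([-12.0, -9.5]), torch.tensor([-11.0, -13.0])
ref_c, ref_r = torch.tensor([-12.5, -10.0]), torch.tensor([-10.5, -12.0])
print(dpo_loss(lp_c, lp_r, ref_c, ref_r).item())
```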
+
+
+
+
+ + FAST-LIVO2 on Resource-Constrained Platforms: LiDAR-Inertial-Visual + Odometry with Efficient Memory and Computation + + +
+ This paper presents a lightweight LiDAR-inertial-visual odometry system
+optimized for resource-constrained platforms. It integrates a
+degeneration-aware adaptive visual frame selector into an error-state iterated
+Kalman filter (ESIKF) with sequential updates, improving computation efficiency
+significantly while maintaining a similar level of robustness. Additionally, a
+memory-efficient mapping structure combining a locally unified visual-LiDAR map
+and a long-term visual map achieves a good trade-off between performance and
+memory usage. Extensive experiments on x86 and ARM platforms demonstrate the
+system's robustness and efficiency. On the Hilti dataset, our system achieves a
+33% reduction in per-frame runtime and 47% lower memory usage compared to
+FAST-LIVO2, with only a 3 cm increase in RMSE. Despite this slight accuracy
+trade-off, our system remains competitive, outperforming state-of-the-art
+(SOTA) LIO methods such as FAST-LIO2 and most existing LIVO systems. These
+results validate the system's capability for scalable deployment on
+resource-constrained edge computing platforms.
+
+
+
+
+
+ + ☆ First Lessons Learned of an Artificial Intelligence Robotic System for + Autonomous Coarse Waste Recycling Using Multispectral Imaging-Based Methods + + +
+ Current disposal facilities for coarse-grained waste perform manual sorting
+of materials with heavy machinery. Large quantities of recyclable materials are
+lost to coarse waste, so more effective sorting processes must be developed to
+recover them. Two key aspects to automate the sorting process are object
+detection with material classification in mixed piles of waste, and autonomous
+control of hydraulic machinery. Because most objects in those accumulations of
+waste are damaged or destroyed, object detection alone is not feasible in the
+majority of cases. To address these challenges, we propose a classification of
+materials with multispectral images of ultraviolet (UV), visual (VIS), near
+infrared (NIR), and short-wave infrared (SWIR) spectra. A solution for
+autonomous control of hydraulic heavy machinery for sorting bulky waste is
+being investigated using cost-effective cameras and artificial
+intelligence-based controllers.
+
+
+ comment: Published in Proceedings of Sardinia 2023, 19th International + Symposium on Waste Management, Resource Recovery and Sustainable Landfilling +
+
+
+
+
+ + ☆ Temporal Logic Guided Safe Navigation for Autonomous Vehicles + + +
+ Safety verification for autonomous vehicles (AVs) and ground robots is +crucial for ensuring reliable operation given their uncertain environments. +Formal language tools provide a robust and sound method to verify safety rules +for such complex cyber-physical systems. In this paper, we propose a hybrid +approach that combines the strengths of formal verification languages like +Linear Temporal Logic (LTL) and Signal Temporal Logic (STL) to generate safe +trajectories and optimal control inputs for autonomous vehicle navigation. We +implement a symbolic path planning approach using LTL to generate a formally +safe reference trajectory. A mixed integer linear programming (MILP) solver is +then used on this reference trajectory to solve for the control inputs while +satisfying the state, control and safety constraints described by STL. We test +our proposed solution on two environments and compare the results with popular +path planning algorithms. In contrast to conventional path planning algorithms, +our formally safe solution excels in handling complex specification scenarios +while ensuring both safety and comparable computation times. + +
+
+ comment: 6 pages, 5 figures, Modelling Estimation and Controls Conference-2024 +
+
+
+
+
+ + ☆ Towards Real-World Validation of a Physics-Based Ship Motion Prediction + Model + + +
+ The maritime industry aims towards a sustainable future, which requires +significant improvements in operational efficiency. Current approaches focus on +minimising fuel consumption and emissions through greater autonomy. Efficient +and safe autonomous navigation requires high-fidelity ship motion models +applicable to real-world conditions. Although physics-based ship motion models +can predict ships' motion with sub-second resolution, their validation in +real-world conditions is rarely found in the literature. This study presents a +physics-based 3D dynamics motion model that is tailored to a container-ship, +and compares its predictions against real-world voyages. The model integrates +vessel motion over time and accounts for its hydrodynamic behavior under +different environmental conditions. The model's predictions are evaluated +against real vessel data both visually and using multiple distance measures. +Both methodologies demonstrate that the model's predictions align closely with +the real-world trajectories of the container-ship. + +
+
+
+
+
+ + ☆ You Only Crash Once v2: Perceptually Consistent Strong Features for + One-Stage Domain Adaptive Detection of Space Terrain + + +
+ The in-situ detection of planetary, lunar, and small-body surface terrain is +crucial for autonomous spacecraft applications, where learning-based computer +vision methods are increasingly employed to enable intelligence without prior +information or human intervention. However, many of these methods remain +computationally expensive for spacecraft processors and prevent real-time +operation. Training of such algorithms is additionally complex due to the +scarcity of labeled data and reliance on supervised learning approaches. +Unsupervised Domain Adaptation (UDA) offers a promising solution by +facilitating model training with disparate data sources such as simulations or +synthetic scenes, although UDA is difficult to apply to celestial environments +where challenging feature spaces are paramount. To alleviate such issues, You +Only Crash Once (YOCOv1) has studied the integration of Visual Similarity-based +Alignment (VSA) into lightweight one-stage object detection architectures to +improve space terrain UDA. Although proven effective, the approach faces +notable limitations, including performance degradations in multi-class and +high-altitude scenarios. Building upon the foundation of YOCOv1, we propose +novel additions to the VSA scheme that enhance terrain detection capabilities +under UDA, and our approach is evaluated across both simulated and real-world +data. Our second YOCO rendition, YOCOv2, is capable of achieving +state-of-the-art UDA performance on surface terrain detection, where we +showcase improvements upwards of 31% compared with YOCOv1 and terrestrial +state-of-the-art. We demonstrate the practical utility of YOCOv2 with +spacecraft flight hardware performance benchmarking and qualitative evaluation +of NASA mission data. + +
+
+
+
+
+ + ☆ The Road to Learning Explainable Inverse Kinematic Models: Graph Neural + Networks as Inductive Bias for Symbolic Regression + + +
+ This paper shows how a Graph Neural Network (GNN) can be used to learn an
+Inverse Kinematics (IK) model based on an automatically generated dataset. The
+generated Inverse Kinematics is generalized to a family of manipulators with
+the same Degree of Freedom (DOF), but varying link length configurations. The
+results indicate a position error of less than 1.0 cm for 3 DOF and 4.5 cm for
+5 DOF, and an orientation error of 2° for 3 DOF and 8.2° for 6 DOF, which
+allows deployment to certain real-world problems. However, out-of-domain errors
+and lack of extrapolation can be observed in the resulting GNN. An extensive
+analysis of these errors indicates potential for enhancement in the future.
+Consequently, the generated GNNs are tailored to be used in future work as an
+inductive bias to generate analytical equations through symbolic regression.
+
+
+
+
+
+ + ☆ Iterative Shaping of Multi-Particle Aggregates based on Action Trees and + VLM + + +
+ In this paper, we address the problem of manipulating multi-particle +aggregates using a bimanual robotic system. Our approach enables the autonomous +transport of dispersed particles through a series of shaping and pushing +actions using robotically-controlled tools. Achieving this advanced +manipulation capability presents two key challenges: high-level task planning +and trajectory execution. For task planning, we leverage Vision Language Models +(VLMs) to enable primitive actions such as tool affordance grasping and +non-prehensile particle pushing. For trajectory execution, we represent the +evolving particle aggregate's contour using truncated Fourier series, providing +efficient parametrization of its closed shape. We adaptively compute trajectory +waypoints based on group cohesion and the geometric centroid of the aggregate, +accounting for its spatial distribution and collective motion. Through +real-world experiments, we demonstrate the effectiveness of our methodology in +actively shaping and manipulating multi-particle aggregates while maintaining +high system cohesion. + +
+
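The contour parametrization mentioned in the abstract (a truncated Fourier series over the aggregate's closed boundary, plus a geometric centroid for waypoint computation) can be illustrated with a short, generic sketch; the harmonic count and helper names are assumptions rather than the authors' exact formulation:

```python
import numpy as np

def fourier_contour(contour_xy, n_harmonics=8):
    """Truncated Fourier parametrization of an ordered, closed 2D contour.
    contour_xy: (N, 2) boundary points -> complex Fourier coefficients."""
    z = contour_xy[:, 0] + 1j * contour_xy[:, 1]
    coeffs = np.fft.fft(z) / len(z)
    truncated = np.zeros_like(coeffs)
    truncated[:n_harmonics + 1] = coeffs[:n_harmonics + 1]   # low positive frequencies
    truncated[-n_harmonics:] = coeffs[-n_harmonics:]         # low negative frequencies
    return truncated

def reconstruct_contour(coeffs):
    """Smoothed closed contour recovered from the truncated coefficients."""
    z = np.fft.ifft(coeffs) * len(coeffs)
    return np.stack([z.real, z.imag], axis=1)

def centroid(contour_xy):
    """Geometric centroid of the sampled boundary points."""
    return contour_xy.mean(axis=0)
```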
+
+
+
+ + ☆ Knowledge-Informed Multi-Agent Trajectory Prediction at Signalized + Intersections for Infrastructure-to-Everything + + +
+ Multi-agent trajectory prediction at signalized intersections is crucial for +developing efficient intelligent transportation systems and safe autonomous +driving systems. Due to the complexity of intersection scenarios and the +limitations of single-vehicle perception, the performance of vehicle-centric +prediction methods has reached a plateau. Furthermore, most works underutilize +critical intersection information, including traffic signals, and behavior +patterns induced by road structures. Therefore, we propose a multi-agent +trajectory prediction framework at signalized intersections dedicated to +Infrastructure-to-Everything (I2XTraj). Our framework leverages dynamic graph +attention to integrate knowledge from traffic signals and driving behaviors. A +continuous signal-informed mechanism is proposed to adaptively process +real-time traffic signals from infrastructure devices. Additionally, leveraging +the prior knowledge of the intersection topology, we propose a driving strategy +awareness mechanism to model the joint distribution of goal intentions and +maneuvers. To the best of our knowledge, I2XTraj represents the first +multi-agent trajectory prediction framework explicitly designed for +infrastructure deployment, supplying subscribable prediction services to all +vehicles at intersections. I2XTraj demonstrates state-of-the-art performance on +both the Vehicle-to-Infrastructure dataset V2X-Seq and the aerial-view dataset +SinD for signalized intersections. Quantitative evaluations show that our +approach outperforms existing methods by more than 30% in both multi-agent and +single-agent scenarios. + +
+
+
+
+
+ + ☆ Zero-Shot Trajectory Planning for Signal Temporal Logic Tasks + + +
+ Signal Temporal Logic (STL) is a powerful specification language for
+describing complex temporal behaviors of continuous signals, making it
+well-suited for high-level robotic task descriptions. However, generating
+executable plans for STL tasks is challenging, as it requires consideration of
+the coupling between the task specification and the system dynamics. Existing
+approaches either follow a model-based setting that explicitly requires
+knowledge of the system dynamics or adopt a task-oriented data-driven approach
+to learn plans for specific tasks. In this work, we investigate the problem of
+generating executable STL plans for systems whose dynamics are unknown a
+priori. We propose a new planning framework that uses only task-agnostic data
+during the offline training stage, enabling zero-shot generalization to new STL
+tasks. Our framework is hierarchical, involving: (i) decomposing the STL task
+into a set of progress and time constraints, (ii) searching for time-aware
+waypoints guided by task-agnostic data, and (iii) generating trajectories using
+a pre-trained safe diffusion model. Simulation results demonstrate that our
+method indeed achieves zero-shot generalization to various STL tasks.
+
+
+ comment: submitted +
+
+
+
+
+ + ☆ Emotion estimation from video footage with LSTM + + +
+ Emotion estimation is a field that has been studied for a long time, and
+several machine learning approaches exist. In this paper, we present an LSTM
+model that processes the blend-shapes produced by the MediaPipe library for a
+face detected in a live camera stream to estimate the main emotion from the
+facial expressions. The model is trained on the FER2013 dataset and achieves
+71% accuracy and a 62% F1-score, which meets the accuracy benchmark of the
+FER2013 dataset with significantly reduced computation costs.
+https://github.com/Samir-atra/Emotion_estimation_from_video_footage_with_LSTM_ML_algorithm
+
+
+ comment: 11 pages, 6 figures, 32 references, 4 tables +
+
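As a rough sketch of the kind of model the abstract describes (an LSTM over per-frame MediaPipe blend-shape vectors, classified into FER2013's seven emotion classes), the following minimal PyTorch module could serve as a starting point; the layer sizes, sequence length, and 52-dimensional blend-shape input are assumptions, not the authors' configuration:

```python
import torch
import torch.nn as nn

class BlendshapeLSTM(nn.Module):
    """Sequence of per-frame blendshape vectors -> emotion class logits."""
    def __init__(self, n_blendshapes=52, hidden=128, n_classes=7):
        super().__init__()
        self.lstm = nn.LSTM(n_blendshapes, hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_classes)

    def forward(self, x):                 # x: (batch, frames, n_blendshapes)
        _, (h_n, _) = self.lstm(x)        # h_n: (1, batch, hidden)
        return self.head(h_n[-1])         # logits: (batch, n_classes)

model = BlendshapeLSTM()
logits = model(torch.randn(4, 30, 52))   # e.g. 4 clips of 30 frames each
```

Training such a classifier would then be ordinary cross-entropy over labeled clips.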
+
+
+
+ + ☆ GeomGS: LiDAR-Guided Geometry-Aware Gaussian Splatting for Robot + Localization + + +
+ Mapping and localization are crucial problems in robotics and autonomous +driving. Recent advances in 3D Gaussian Splatting (3DGS) have enabled precise +3D mapping and scene understanding by rendering photo-realistic images. +However, existing 3DGS methods often struggle to accurately reconstruct a 3D +map that reflects the actual scale and geometry of the real world, which +degrades localization performance. To address these limitations, we propose a +novel 3DGS method called Geometry-Aware Gaussian Splatting (GeomGS). This +method fully integrates LiDAR data into 3D Gaussian primitives via a +probabilistic approach, as opposed to approaches that only use LiDAR as initial +points or introduce simple constraints for Gaussian points. To this end, we +introduce a Geometric Confidence Score (GCS), which identifies the structural +reliability of each Gaussian point. The GCS is optimized simultaneously with +Gaussians under probabilistic distance constraints to construct a precise +structure. Furthermore, we propose a novel localization method that fully +utilizes both the geometric and photometric properties of GeomGS. Our GeomGS +demonstrates state-of-the-art geometric and localization performance across +several benchmarks, while also improving photometric performance. + +
+
+ comment: Preprint, Under review +
+
+
+
+
+ + ☆ M3PT: A Transformer for Multimodal, Multi-Party Social Signal Prediction + with Person-aware Blockwise Attention + + +
+ Understanding social signals in multi-party conversations is important for +human-robot interaction and artificial social intelligence. Multi-party +interactions include social signals like body pose, head pose, speech, and +context-specific activities like acquiring and taking bites of food when +dining. Incorporating all the multimodal signals in a multi-party interaction +is difficult, and past work tends to build task-specific models for predicting +social signals. In this work, we address the challenge of predicting multimodal +social signals in multi-party settings in a single model. We introduce M3PT, a +causal transformer architecture with modality and temporal blockwise attention +masking which allows for the simultaneous processing of multiple social cues +across multiple participants and their temporal interactions. This approach +better captures social dynamics over time by considering longer horizons of +social signals between individuals. We train and evaluate our unified model on +the Human-Human Commensality Dataset (HHCD), and demonstrate that using +multiple modalities improves bite timing and speaking status prediction. Source +code: https://github.com/AbrarAnwar/masked-social-signals/ + +
+
+
+
+
+ + ☆ VIGS SLAM: IMU-based Large-Scale 3D Gaussian Splatting SLAM + + +
+ Recently, map representations based on radiance fields such as 3D Gaussian
+Splatting and NeRF, which excel at realistic depiction, have attracted
+considerable attention, leading to attempts to combine them with SLAM. While
+these approaches can build highly realistic maps, large-scale SLAM still
+remains a challenge because they require a large number of Gaussian images for
+mapping and adjacent images as keyframes for tracking. We propose a novel 3D
+Gaussian Splatting SLAM method, VIGS SLAM, that utilizes sensor fusion of RGB-D
+and IMU sensors for large-scale indoor environments. To reduce the
+computational load of 3DGS-based tracking, we adopt an ICP-based tracking
+framework that combines IMU preintegration to provide a good initial guess for
+accurate pose estimation. Our method is the first to show that Gaussian
+Splatting-based SLAM can be performed effectively in large-scale environments
+by integrating IMU sensor measurements. This proposal not only enhances the
+performance of Gaussian Splatting SLAM beyond room-scale scenarios but also
+achieves SLAM performance comparable to state-of-the-art methods in large-scale
+indoor environments.
+
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ CuriousBot: Interactive Mobile Exploration via Actionable 3D Relational + Object Graph + + +
+ Mobile exploration is a longstanding challenge in robotics, yet current +methods primarily focus on active perception instead of active interaction, +limiting the robot's ability to interact with and fully explore its +environment. Existing robotic exploration approaches via active interaction are +often restricted to tabletop scenes, neglecting the unique challenges posed by +mobile exploration, such as large exploration spaces, complex action spaces, +and diverse object relations. In this work, we introduce a 3D relational object +graph that encodes diverse object relations and enables exploration through +active interaction. We develop a system based on this representation and +evaluate it across diverse scenes. Our qualitative and quantitative results +demonstrate the system's effectiveness and generalization capabilities, +outperforming methods that rely solely on vision-language models (VLMs). + +
+
+ comment: Project Page: https://curiousbot.theaiinstitute.com/ +
+
+
+
+
+ + ☆ The Perceived Danger (PD) Scale: Development and Validation + + +
+ There are currently no psychometrically valid tools to measure the perceived +danger of robots. To fill this gap, we provided a definition of perceived +danger and developed and validated a 12-item bifactor scale through four +studies. An exploratory factor analysis revealed four subdimensions of +perceived danger: affective states, physical vulnerability, ominousness, and +cognitive readiness. A confirmatory factor analysis confirmed the bifactor +model. We then compared the perceived danger scale to the Godspeed perceived +safety scale and found that the perceived danger scale is a better predictor of +empirical data. We also validated the scale in an in-person setting and found +that the perceived danger scale is sensitive to robot speed manipulations, +consistent with previous empirical findings. Results across experiments suggest +that the perceived danger scale is reliable, valid, and an adequate predictor +of both perceived safety and perceived danger in human-robot interaction +contexts. + +
+
+ comment: 9 pages, 2 figures, to be published in the Proceedings of the 2025 + ACM/IEEE International Conference on Human-Robot Interaction (HRI) +
+
+
+
+
+ + ☆ Integrating Persian Lip Reading in Surena-V Humanoid Robot for + Human-Robot Interaction + + +
+ Lip reading is vital for robots in social settings, improving their ability
+to understand human communication. This skill allows them to communicate more
+easily in crowded environments, especially in caregiving and customer service
+roles. Generating a Persian lip-reading dataset, this study integrates Persian
+lip-reading technology into the Surena-V humanoid robot to improve its speech
+recognition capabilities. Two complementary methods are explored: an indirect
+method using facial landmark tracking and a direct method leveraging
+convolutional neural networks (CNNs) and long short-term memory (LSTM)
+networks. The indirect method focuses on tracking key facial landmarks,
+especially around the lips, to infer movements, while the direct method
+processes raw video data for action and speech recognition. The best-performing
+model, the LSTM, achieved 89% accuracy and has been successfully implemented in
+the Surena-V robot for real-time human-robot interaction. The study highlights
+the effectiveness of these methods, particularly in environments where verbal
+communication is limited.
+
+
+
+
+
+ + ☆ CSAOT: Cooperative Multi-Agent System for Active Object Tracking + + +
+ Object Tracking is essential for many computer vision applications, such as +autonomous navigation, surveillance, and robotics. Unlike Passive Object +Tracking (POT), which relies on static camera viewpoints to detect and track +objects across consecutive frames, Active Object Tracking (AOT) requires a +controller agent to actively adjust its viewpoint to maintain visual contact +with a moving target in complex environments. Existing AOT solutions are +predominantly single-agent-based, which struggle in dynamic and complex +scenarios due to limited information gathering and processing capabilities, +often resulting in suboptimal decision-making. Alleviating these limitations +necessitates the development of a multi-agent system where different agents +perform distinct roles and collaborate to enhance learning and robustness in +dynamic and complex environments. Although some multi-agent approaches exist +for AOT, they typically rely on external auxiliary agents, which require +additional devices, making them costly. In contrast, we introduce the +Collaborative System for Active Object Tracking (CSAOT), a method that +leverages multi-agent deep reinforcement learning (MADRL) and a Mixture of +Experts (MoE) framework to enable multiple agents to operate on a single +device, thereby improving tracking performance and reducing costs. Our approach +enhances robustness against occlusions and rapid motion while optimizing camera +movements to extend tracking duration. We validated the effectiveness of CSAOT +on various interactive maps with dynamic and stationary obstacles. + +
+
+
+
+
+ + ☆ MCRL4OR: Multimodal Contrastive Representation Learning for Off-Road + Environmental Perception + + +
+ Most studies on environmental perception for autonomous vehicles (AVs) focus
+on urban traffic environments, where the objects/stuff to be perceived are
+mainly from man-made scenes and scalable datasets with dense annotations can be
+used to train supervised learning models. By contrast, it is hard to densely
+annotate a large-scale off-road driving dataset manually due to the inherently
+unstructured nature of off-road environments. In this paper, we propose a
+Multimodal Contrastive Representation Learning approach for Off-Road
+environmental perception, namely MCRL4OR. This approach aims to jointly learn
+three encoders for processing visual images, locomotion states, and control
+actions by aligning the locomotion states with the fused features of visual
+images and control actions within a contrastive learning framework. The
+rationale behind this alignment strategy is that the inertial locomotion state
+is the result of taking a certain control action under the current
+landform/terrain condition perceived by visual sensors. In experiments, we
+pre-train MCRL4OR with a large-scale off-road driving dataset and adopt the
+learned multimodal representations for various downstream perception tasks in
+off-road driving scenarios. The superior performance in downstream tasks
+demonstrates the advantages of the pre-trained multimodal representations. The
+code can be found at https://github.com/1uciusy/MCRL4OR.
+
+
+ comment: Github repository: https://github.com/1uciusy/MCRL4OR +
+
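The alignment the abstract describes (locomotion-state features pulled toward the fused visual+action features from the same moment) is typically realized with an InfoNCE-style objective; the sketch below is a generic symmetric InfoNCE, offered as an assumption about the form of the loss rather than the paper's exact objective:

```python
import torch
import torch.nn.functional as F

def infonce_alignment(loco_emb, fused_emb, temperature=0.07):
    """Symmetric InfoNCE between locomotion-state embeddings and fused
    visual+action embeddings; row i of each tensor is the same timestep."""
    a = F.normalize(loco_emb, dim=1)
    b = F.normalize(fused_emb, dim=1)
    logits = a @ b.t() / temperature                  # (B, B) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))
```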
+
+
+
+ + ♻ ☆ Entanglement Definitions for Tethered Robots: Exploration and Analysis + + +
+ In this article we consider the problem of tether entanglement for tethered +mobile robots. One of the main risks of using a tethered connection between a +mobile robot and an anchor point is that the tether may get entangled with the +obstacles present in the environment or with itself. To avoid these situations, +a non-entanglement constraint can be considered in the motion planning problem +for tethered robots. This constraint is typically expressed as a set of +specific tether configurations that must be avoided. However, the literature +lacks a generally accepted definition of entanglement, with existing +definitions being limited and partial in the sense that they only focus on +specific instances of entanglement. In practice, this means that the existing +definitions do not effectively cover all instances of tether entanglement. Our +goal in this article is to bridge this gap and to provide new definitions of +entanglement, which, together with the existing ones, can be effectively used +to qualify the entanglement state of a tethered robot in diverse situations. +The new definitions find application in motion planning for tethered robots, +where they can be used to obtain more safe and robust entanglement-free +trajectories. + +
+
+ comment: 18 pages, 9 figures. Published in IEEE Access
+
+
+
+
+ + ♻ ☆ TrojanRobot: Physical-World Backdoor Attacks Against VLM-based Robotic + Manipulation + + +
+ Robotic manipulation in the physical world is increasingly empowered by
+large language models (LLMs) and vision-language models (VLMs), leveraging
+their understanding and perception capabilities. Recently, various attacks
+against such robotic policies have been proposed, with backdoor attacks drawing
+considerable attention for their high stealth and strong persistence
+capabilities. However, existing backdoor efforts are limited to simulators and
+struggle with physical-world realization. To address this, we propose
+TrojanRobot, a highly stealthy and broadly effective robotic backdoor attack in
+the physical world. Specifically, we introduce a module-poisoning approach by
+embedding a backdoor module into the modular robotic policy, enabling backdoor
+control over the policy's visual perception module and thereby backdooring the
+entire robotic policy. Our vanilla implementation leverages a
+backdoor-finetuned VLM to serve as the backdoor module. To enhance its
+generalization in physical environments, we propose a prime implementation,
+leveraging the LVLM-as-a-backdoor paradigm and developing three types of prime
+attacks, i.e., permutation, stagnation, and intentional attacks, thus achieving
+finer-grained backdoors. Extensive experiments on the UR3e manipulator with 18
+task instructions using robotic policies based on four VLMs demonstrate the
+broad effectiveness and physical-world stealth of TrojanRobot. Our attack's
+video demonstrations are available at https://trojanrobot.github.io.
+
+
+
+
+
+ + ♻ ☆ MOB-Net: Limb-modularized Uncertainty Torque Learning of Humanoids for + Sensorless External Torque Estimation IJRR + + +
+ A momentum observer (MOB) can estimate external joint torque without
+requiring additional sensors, such as force/torque or joint torque sensors.
+However, the estimation performance of MOB deteriorates due to model
+uncertainty, which encompasses modeling errors and joint friction. Moreover,
+the estimation error is significant when MOB is applied to high-dimensional
+floating-base humanoids, which prevents the estimated external joint torque
+from being used for force control or collision detection on a real humanoid
+robot. In this paper, a pure external joint torque estimation method for
+humanoids, named MOB-Net, is proposed. MOB-Net learns the model uncertainty
+torque and calibrates the estimated signal of MOB. The external joint torque
+can be estimated in generalized coordinates, including whole-body and virtual
+joints of the floating-base robot, using only internal sensors (an IMU on the
+pelvis and encoders in the joints). Our method substantially reduces the
+estimation errors of MOB, and the robust performance of MOB-Net on unseen data
+is validated through extensive simulations, real robot experiments, and
+ablation studies. Finally, various collision handling scenarios are presented
+using the estimated external joint torque from MOB-Net: contact wrench feedback
+control for locomotion, collision detection, and collision reaction for safety.
+
+
+ comment: Published in IJRR
+
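For context, the classical momentum observer that MOB-Net learns to calibrate can be written as a first-order residual filter on the generalized momentum p = M(q)·q̇. The discrete-time sketch below is the textbook MOB (not MOB-Net itself), with the residual r initialized to zero and the integral initialized to p(0); all names are generic placeholders:

```python
import numpy as np

def mob_step(r, integral, p, tau_m, C, qdot, g, K_O, dt):
    """One update of the generalized-momentum residual r, which acts as a
    first-order-filtered estimate of the external joint torque.
    p = M(q) @ qdot; C and g come from the (possibly uncertain) dynamic model,
    which is exactly the uncertainty MOB-Net is trained to compensate."""
    integral = integral + (tau_m + C.T @ qdot - g + r) * dt
    r = K_O @ (p - integral)       # K_O: positive-definite observer gain matrix
    return r, integral
```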
+
+
+
+ + ♻ ☆ Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid + Prototyping in Virtual Reality Applications + + +
+ SLAM is a foundational technique with broad applications in robotics and +AR/VR. SLAM simulations evaluate new concepts, but testing on +resource-constrained devices, such as VR HMDs, faces challenges: high +computational cost and restricted sensor data access. This work proposes a +sparse framework using mesh geometry projections as features, which improves +efficiency and circumvents direct sensor data access, advancing SLAM research +as we demonstrate in VR and through numerical evaluation. + +
+
+ comment: Accepted to ENPT XR at IEEE VR 2025 +
+
+
+
+
+ + ♻ ☆ Design Optimizer for Soft Growing Robot Manipulators in + Three-Dimensional Environments + + +
+ Soft growing robots are novel devices that mimic plant-like growth for +navigation in cluttered or dangerous environments. Their ability to adapt to +surroundings, combined with advancements in actuation and manufacturing +technologies, allows them to perform specialized manipulation tasks. This work +presents an approach for design optimization of soft growing robots; +specifically, the three-dimensional extension of the optimizer designed for +planar manipulators. This tool is intended to be used by engineers and robot +enthusiasts before manufacturing their robot: it suggests the optimal size of +the robot for solving a specific task. The design process models a +multi-objective optimization problem to refine a soft manipulator's kinematic +chain. Thanks to the novel Rank Partitioning algorithm integrated into +Evolutionary Computation (EC) algorithms, this method achieves high precision +in reaching targets and is efficient in resource usage. Results show +significantly high performance in solving three-dimensional tasks, whereas +comparative experiments indicate that the optimizer features robust output when +tested with different EC algorithms, particularly genetic algorithms. + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Generative Graphical Inverse Kinematics + + +
+ Quickly and reliably finding accurate inverse kinematics (IK) solutions +remains a challenging problem for many robot manipulators. Existing numerical +solvers are broadly applicable but typically only produce a single solution and +rely on local search techniques to minimize nonconvex objective functions. More +recent learning-based approaches that approximate the entire feasible set of +solutions have shown promise as a means to generate multiple fast and accurate +IK results in parallel. However, existing learning-based techniques have a +significant drawback: each robot of interest requires a specialized model that +must be trained from scratch. To address this key shortcoming, we propose a +novel distance-geometric robot representation coupled with a graph structure +that allows us to leverage the sample efficiency of Euclidean equivariant +functions and the generalizability of graph neural networks (GNNs). Our +approach is generative graphical inverse kinematics (GGIK), the first learned +IK solver able to accurately and efficiently produce a large number of diverse +solutions in parallel while also displaying the ability to generalize -- a +single learned model can be used to produce IK solutions for a variety of +different robots. When compared to several other learned IK methods, GGIK +provides more accurate solutions with the same amount of data. GGIK can +generalize reasonably well to robot manipulators unseen during training. +Additionally, GGIK can learn a constrained distribution that encodes joint +limits and scales efficiently to larger robots and a high number of sampled +solutions. Finally, GGIK can be used to complement local IK solvers by +providing reliable initializations for a local optimization process. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ AdaWM: Adaptive World Model based Planning for Autonomous Driving ICLR 2025 + + +
+ World model based reinforcement learning (RL) has emerged as a promising +approach for autonomous driving, which learns a latent dynamics model and uses +it to train a planning policy. To speed up the learning process, the +pretrain-finetune paradigm is often used, where online RL is initialized by a +pretrained model and a policy learned offline. However, naively performing such +initialization in RL may result in dramatic performance degradation during the +online interactions in the new task. To tackle this challenge, we first analyze +the performance degradation and identify two primary root causes therein: the +mismatch of the planning policy and the mismatch of the dynamics model, due to +distribution shift. We further analyze the effects of these factors on +performance degradation during finetuning, and our findings reveal that the +choice of finetuning strategies plays a pivotal role in mitigating these +effects. We then introduce AdaWM, an Adaptive World Model based planning +method, featuring two key steps: (a) mismatch identification, which quantifies +the mismatches and informs the finetuning strategy, and (b) alignment-driven +finetuning, which selectively updates either the policy or the model as needed +using efficient low-rank updates. Extensive experiments on the challenging +CARLA driving tasks demonstrate that AdaWM significantly improves the +finetuning process, resulting in more robust and efficient performance in +autonomous driving systems. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Graph Optimality-Aware Stochastic LiDAR Bundle Adjustment with + Progressive Spatial Smoothing + + +
+ Large-scale LiDAR Bundle Adjustment (LBA) to refine sensor orientation and
+point cloud accuracy simultaneously to build the navigation map is a
+fundamental task in logistics and robotics. Unlike pose-graph-based methods
+that rely solely on pairwise relationships between LiDAR frames, LBA leverages
+raw LiDAR correspondences to achieve more precise results, especially when
+initial pose estimates are unreliable for low-cost sensors. However, existing
+LBA methods face challenges such as simplistic planar correspondences,
+extensive observations, and dense normal matrices in the least-squares problem,
+which limit robustness, efficiency, and scalability. To address these issues,
+we propose a Graph Optimality-aware Stochastic Optimization scheme with
+Progressive Spatial Smoothing, namely PSS-GOSO, to achieve robust, efficient,
+and scalable LBA. The Progressive Spatial Smoothing (PSS) module extracts
+robust LiDAR feature association exploiting the prior structure information
+obtained by the polynomial smooth kernel. The Graph Optimality-aware Stochastic
+Optimization (GOSO) module first sparsifies the graph according to optimality
+for an efficient optimization. GOSO then utilizes stochastic clustering and
+graph marginalization to solve the large-scale state estimation problem for a
+scalable LBA. We validate PSS-GOSO across diverse scenes captured by various
+platforms, demonstrating its superior performance compared to existing methods.
+Moreover, the resulting point cloud maps are used for automatic last-mile
+delivery in large-scale complex scenes. The project page can be found at:
+https://kafeiyin00.github.io/PSS-GOSO/.
+
+
+
+
+
+ + ♻ ☆ SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and + Chain-of-Thought for Embodied Task Planning + + +
+ Spatial reasoning is an essential problem in embodied AI research. Efforts to +enhance spatial reasoning abilities through supplementary spatial data and +fine-tuning have proven limited and ineffective when addressing complex +embodied tasks, largely due to their dependence on language-based outputs. +While some approaches have introduced a point-based action space to mitigate +this issue, they fall short in managing more intricate tasks within complex +environments. This deficiency arises from their failure to fully exploit the +inherent thinking and reasoning capabilities that are fundamental strengths of +Vision-Language Models (VLMs). To address these limitations, we propose a novel +approach named SpatialCoT, specifically designed to bolster the spatial +reasoning capabilities of VLMs. Our approach comprises two stages: spatial +coordinate bi-directional alignment, which aligns vision-language inputs with +spatial coordinates, and chain-of-thought spatial grounding, which harnesses +the reasoning capabilities of language models for advanced spatial reasoning. +We evaluate SpatialCoT on challenging navigation and manipulation tasks, both +in simulation and real-world settings. Experimental results demonstrate that +our method significantly outperforms previous state-of-the-art approaches in +both tasks. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ POLAR-Sim: Augmenting NASA's POLAR Dataset for Data-Driven Lunar + Perception and Rover Simulation + + +
+ NASA's POLAR dataset contains approximately 2,600 pairs of high dynamic range
+stereo photos captured across 13 varied terrain scenarios, including areas with
+sparse or dense rock distributions, craters, and rocks of different sizes. The
+purpose of these photos is to spur development in robotics, AI-based
+perception, and autonomous navigation. Acknowledging the scarcity of images
+from around the lunar poles, NASA Ames produced, on Earth but under controlled
+conditions, images that resemble rover operating conditions in these regions of
+the Moon. We report on the outcomes of an effort aimed at accomplishing two
+tasks. In Task 1, we provided bounding boxes and semantic segmentation
+information for all the images in NASA's POLAR dataset. This effort resulted in
+23,000 labels and semantic segmentation annotations pertaining to rocks,
+shadows, and craters. In Task 2, we generated the digital twins of the 13
+scenarios that have been used to produce all the photos in the POLAR dataset.
+Specifically, for each of these scenarios, we produced individual meshes,
+texture information, and material properties associated with the ground and the
+rocks in each scenario. This allows anyone with a camera model to synthesize
+images associated with any of the 13 scenarios of the POLAR dataset.
+Effectively, one can generate as many semantically labeled synthetic images as
+desired -- with different locations and exposure values in the scene, for
+different positions of the sun, with or without the presence of active
+illumination, etc. The benefit of this work is twofold. Using the outcomes of
+Task 1, one can train and/or test perception algorithms that deal with Moon
+images. Using the outcomes of Task 2, one can produce as much data as desired
+to train and test AI algorithms that are anticipated to work in lunar
+conditions. All the outcomes of this work are available in a public repository
+for unfettered use and distribution.
+
+
+ comment: 11 pages, 9 figures. This work has been submitted to the IEEE for + possible publication +
+
+
+
+
+ + ♻ ☆ Design and Implementation of an Efficient Onboard Computer System for + CanSat Atmosphere Monitoring + + +
+ With advancements in technology, smaller versions of satellites have gained
+momentum in the space industry for Earth monitoring and communication-based
+applications. The rise of CanSat technology has significantly impacted the
+space industry by providing a cost-effective solution for space exploration.
+A CanSat is a simulation model of a real satellite and plays a crucial role in
+collecting and transmitting atmospheric data. This paper discusses the design
+of an Onboard Computer System for CanSat, used to study various environmental
+parameters by monitoring the concentrations of gases in the atmosphere. The
+Onboard Computer System uses GPS, accelerometer, altitude, temperature,
+pressure, gyroscope, magnetometer, UV radiation, and air quality sensors for
+atmospheric sensing. A highly efficient and low-power ESP32 microcontroller and
+a transceiver module are used to acquire data, facilitate seamless
+communication, and transmit the collected data to the ground station.
+
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 145 + +
+
+
+ + ☆ Fast3R: Towards 3D Reconstruction of 1000+ Images in One Forward Pass + + +
+ Multi-view 3D reconstruction remains a core challenge in computer vision, +particularly in applications requiring accurate and scalable representations +across diverse perspectives. Current leading methods such as DUSt3R employ a +fundamentally pairwise approach, processing images in pairs and necessitating +costly global alignment procedures to reconstruct from multiple views. In this +work, we propose Fast 3D Reconstruction (Fast3R), a novel multi-view +generalization to DUSt3R that achieves efficient and scalable 3D reconstruction +by processing many views in parallel. Fast3R's Transformer-based architecture +forwards N images in a single forward pass, bypassing the need for iterative +alignment. Through extensive experiments on camera pose estimation and 3D +reconstruction, Fast3R demonstrates state-of-the-art performance, with +significant improvements in inference speed and reduced error accumulation. +These results establish Fast3R as a robust alternative for multi-view +applications, offering enhanced scalability without compromising reconstruction +accuracy. + +
+
+ comment: Project website: https://fast3r-3d.github.io/ +
+
+
+
+
+ + ☆ CRPO: Confidence-Reward Driven Preference Optimization for Machine + Translation + + +
+ Large language models (LLMs) have shown great potential in natural language +processing tasks, but their application to machine translation (MT) remains +challenging due to pretraining on English-centric data and the complexity of +reinforcement learning from human feedback (RLHF). Direct Preference +Optimization (DPO) has emerged as a simpler and more efficient alternative, but +its performance depends heavily on the quality of preference data. To address +this, we propose Confidence-Reward driven Preference Optimization (CRPO), a +novel method that combines reward scores with model confidence to improve data +selection for fine-tuning. CRPO selects challenging sentence pairs where the +model is uncertain or underperforms, leading to more effective learning. While +primarily designed for LLMs, CRPO also generalizes to encoder-decoder models +like NLLB, demonstrating its versatility. Empirical results show that CRPO +outperforms existing methods such as RS-DPO, RSO and MBR score in both +translation accuracy and data efficiency. + +
+
+
+
+
+ + ☆ Can We Generate Images with CoT? Let's Verify and Reinforce Image + Generation Step by Step + + +
+ Chain-of-Thought (CoT) reasoning has been extensively explored in large +models to tackle complex understanding tasks. However, it still remains an open +question whether such strategies can be applied to verifying and reinforcing +image generation scenarios. In this paper, we provide the first comprehensive +investigation of the potential of CoT reasoning to enhance autoregressive image +generation. We focus on three techniques: scaling test-time computation for +verification, aligning model preferences with Direct Preference Optimization +(DPO), and integrating these techniques for complementary effects. Our results +demonstrate that these approaches can be effectively adapted and combined to +significantly improve image generation performance. Furthermore, given the +pivotal role of reward models in our findings, we propose the Potential +Assessment Reward Model (PARM) and PARM++, specialized for autoregressive image +generation. PARM adaptively assesses each generation step through a potential +assessment approach, merging the strengths of existing reward models, and +PARM++ further introduces a reflection mechanism to self-correct the generated +unsatisfactory image. Using our investigated reasoning strategies, we enhance a +baseline model, Show-o, to achieve superior results, with a significant +24% +improvement on the GenEval benchmark, surpassing Stable Diffusion 3 by +15%. We +hope our study provides unique insights and paves a new path for integrating +CoT reasoning with autoregressive image generation. Code and models are +released at https://github.com/ZiyuGuo99/Image-Generation-CoT + +
+
+ comment: Journal Version. Code and models are released at + https://github.com/ZiyuGuo99/Image-Generation-CoT +
+
+
+
+
+ + ☆ Towards Robust Multimodal Open-set Test-time Adaptation via Adaptive + Entropy-aware Optimization ICLR 2025 + + +
+ Test-time adaptation (TTA) has demonstrated significant potential in
+addressing distribution shifts between training and testing data. Open-set
+test-time adaptation (OSTTA) aims to adapt a source pre-trained model online to
+an unlabeled target domain that contains unknown classes. This task becomes
+more challenging when multiple modalities are involved. Existing methods have
+primarily focused on unimodal OSTTA, often filtering out low-confidence samples
+without addressing the complexities of multimodal data. In this work, we
+present Adaptive Entropy-aware Optimization (AEO), a novel framework
+specifically designed to tackle Multimodal Open-set Test-time Adaptation
+(MM-OSTTA) for the first time. Our analysis shows that the entropy difference
+between known and unknown samples in the target domain strongly correlates with
+MM-OSTTA performance. To leverage this, we propose two key components:
+Unknown-aware Adaptive Entropy Optimization (UAE) and Adaptive Modality
+Prediction Discrepancy Optimization (AMP). These components enhance the model's
+ability to distinguish unknown-class samples during online adaptation by
+amplifying the entropy difference between known and unknown samples. To
+thoroughly evaluate our proposed methods in the MM-OSTTA setting, we establish
+a new benchmark derived from existing datasets. This benchmark includes two
+downstream tasks and incorporates five modalities. Extensive experiments across
+various domain shift situations demonstrate the efficacy and versatility of the
+AEO framework. Additionally, we highlight the strong performance of AEO in
+long-term and continual MM-OSTTA settings, both of which are challenging and
+highly relevant to real-world applications. Our source code is available at
+https://github.com/donghao51/AEO.
+
+
+ comment: Accepted by ICLR 2025 +
+
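The central quantity in the abstract is the prediction-entropy gap between known and unknown target samples. The snippet below is only a loose illustration of how per-sample entropy could be computed and pushed apart during adaptation; the median split and the signed objective are assumptions, not the paper's UAE/AMP formulations:

```python
import torch
import torch.nn.functional as F

def prediction_entropy(logits):
    """Shannon entropy of the softmax prediction, per sample."""
    p = F.softmax(logits, dim=1)
    return -(p * torch.log(p.clamp_min(1e-12))).sum(dim=1)

def entropy_gap_objective(logits):
    """Minimize entropy for confident (likely known) samples and maximize it
    for high-entropy (likely unknown) ones, amplifying the entropy gap."""
    h = prediction_entropy(logits)
    threshold = h.median()                 # crude known/unknown split
    known, unknown = h[h <= threshold], h[h > threshold]
    return known.mean() - unknown.mean()
```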
+
+
+
+ + ☆ GeoPixel: Pixel Grounding Large Multimodal Model in Remote Sensing + + +
+ Recent advances in large multimodal models (LMMs) have recognized +fine-grained grounding as an imperative factor of visual understanding and +dialogue. However, the benefits of such representation in LMMs are limited to +the natural image domain, and these models perform poorly for remote sensing +(RS). The distinct overhead viewpoint, scale variation, and presence of small +objects in high-resolution RS imagery present a unique challenge in +region-level comprehension. Moreover, the development of the grounding +conversation capability of LMMs within RS is hindered by the lack of granular, +RS domain-specific grounded data. Addressing these limitations, we propose +GeoPixel - the first end-to-end high resolution RS-LMM that supports +pixel-level grounding. This capability allows fine-grained visual perception by +generating interleaved masks in conversation. GeoPixel supports up to 4K HD +resolution in any aspect ratio, ideal for high-precision RS image analysis. To +support the grounded conversation generation (GCG) in RS imagery, we curate a +visually grounded dataset GeoPixelD through a semi-automated pipeline that +utilizes set-of-marks prompting and spatial priors tailored for RS data to +methodically control the data generation process. GeoPixel demonstrates +superior performance in pixel-level comprehension, surpassing existing LMMs in +both single-target and multi-target segmentation tasks. Our methodological +ablation studies validate the effectiveness of each component in the overall +architecture. Our code and data will be publicly released. + +
+
+
+
+
+ + ☆ IMAGINE-E: Image Generation Intelligence Evaluation of State-of-the-art + Text-to-Image Models + + +
+ With the rapid development of diffusion models, text-to-image (T2I) models
+have made significant progress, showcasing impressive abilities in prompt
+following and image generation. Recently launched models such as FLUX.1 and
+Ideogram2.0, along with others like Dall-E3 and Stable Diffusion 3, have
+demonstrated exceptional performance across various complex tasks, raising
+questions about whether T2I models are moving towards general-purpose
+applicability. Beyond traditional image generation, these models exhibit
+capabilities across a range of fields, including controllable generation, image
+editing, video, audio, 3D, and motion generation, as well as computer vision
+tasks like semantic segmentation and depth estimation. However, current
+evaluation frameworks are insufficient to comprehensively assess these models'
+performance across expanding domains. To thoroughly evaluate these models, we
+developed IMAGINE-E and tested six prominent models: FLUX.1, Ideogram2.0,
+Midjourney, Dall-E3, Stable Diffusion 3, and Jimeng. Our evaluation is divided
+into five key domains: structured output generation, realism and physical
+consistency, specific domain generation, challenging scenario generation, and
+multi-style creation tasks. This comprehensive assessment highlights each
+model's strengths and limitations, particularly the outstanding performance of
+FLUX.1 and Ideogram2.0 in structured and specific domain tasks, underscoring
+the expanding applications and potential of T2I models as foundational AI
+tools. This study provides valuable insights into the current state and future
+trajectory of T2I models as they evolve towards general-purpose usability.
+Evaluation scripts will be released at https://github.com/jylei16/Imagine-e.
+
+
+ comment: 75 pages, 73 figures, Evaluation scripts: + https://github.com/jylei16/Imagine-e +
+
+
+
+
+ + ☆ Temporal Preference Optimization for Long-Form Video Understanding + + +
+ Despite significant advancements in video large multimodal models +(video-LMMs), achieving effective temporal grounding in long-form videos +remains a challenge for existing models. To address this limitation, we propose +Temporal Preference Optimization (TPO), a novel post-training framework +designed to enhance the temporal grounding capabilities of video-LMMs through +preference learning. TPO adopts a self-training approach that enables models to +differentiate between well-grounded and less accurate temporal responses by +leveraging curated preference datasets at two granularities: localized temporal +grounding, which focuses on specific video segments, and comprehensive temporal +grounding, which captures extended temporal dependencies across entire video +sequences. By optimizing on these preference datasets, TPO significantly +enhances temporal understanding while reducing reliance on manually annotated +data. Extensive experiments on three long-form video understanding +benchmarks--LongVideoBench, MLVU, and Video-MME--demonstrate the effectiveness +of TPO across two state-of-the-art video-LMMs. Notably, LLaVA-Video-TPO +establishes itself as the leading 7B model on the Video-MME benchmark, +underscoring the potential of TPO as a scalable and efficient solution for +advancing temporal reasoning in long-form video understanding. Project page: +https://ruili33.github.io/tpo_website. + +
+
+
+
+
+ + ☆ Improving Video Generation with Human Feedback + + +
+ Video generation has achieved significant advances through rectified flow +techniques, but issues like unsmooth motion and misalignment between videos and +prompts persist. In this work, we develop a systematic pipeline that harnesses +human feedback to mitigate these problems and refine the video generation +model. Specifically, we begin by constructing a large-scale human preference +dataset focused on modern video generation models, incorporating pairwise +annotations across multi-dimensions. We then introduce VideoReward, a +multi-dimensional video reward model, and examine how annotations and various +design choices impact its rewarding efficacy. From a unified reinforcement +learning perspective aimed at maximizing reward with KL regularization, we +introduce three alignment algorithms for flow-based models by extending those +from diffusion models. These include two training-time strategies: direct +preference optimization for flow (Flow-DPO) and reward weighted regression for +flow (Flow-RWR), and an inference-time technique, Flow-NRG, which applies +reward guidance directly to noisy videos. Experimental results indicate that +VideoReward significantly outperforms existing reward models, and Flow-DPO +demonstrates superior performance compared to both Flow-RWR and standard +supervised fine-tuning methods. Additionally, Flow-NRG lets users assign custom +weights to multiple objectives during inference, meeting personalized video +quality needs. Project page: https://gongyeliu.github.io/videoalign. + +
+
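Flow-DPO and Flow-RWR extend preference-alignment objectives from diffusion to flow models; for orientation, the standard (non-flow) DPO loss they build on can be sketched as below, given summed log-probabilities of the preferred and rejected samples under the trained and frozen reference models. The flow-specific likelihood terms used in the paper are not reproduced here:

```python
import torch.nn.functional as F

def dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1):
    """Standard DPO objective over (preferred, rejected) pairs.
    logp_* are log-probabilities under the trained model, ref_logp_* under the
    frozen reference; beta controls the KL-regularization strength."""
    margin = (logp_w - ref_logp_w) - (logp_l - ref_logp_l)
    return -F.logsigmoid(beta * margin).mean()
```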
+
+
+
+ + ☆ Binary Diffusion Probabilistic Model + + +
+ We introduce the Binary Diffusion Probabilistic Model (BDPM), a novel +generative model optimized for binary data representations. While denoising +diffusion probabilistic models (DDPMs) have demonstrated notable success in +tasks like image synthesis and restoration, traditional DDPMs rely on +continuous data representations and mean squared error (MSE) loss for training, +applying Gaussian noise models that may not be optimal for discrete or binary +data structures. BDPM addresses this by decomposing images into bitplanes and +employing XOR-based noise transformations, with a denoising model trained using +binary cross-entropy loss. This approach enables precise noise control and +computationally efficient inference, significantly lowering computational costs +and improving model convergence. When evaluated on image restoration tasks such +as image super-resolution, inpainting, and blind image restoration, BDPM +outperforms state-of-the-art methods on the FFHQ, CelebA, and CelebA-HQ +datasets. Notably, BDPM requires fewer inference steps than traditional DDPM +models to reach optimal results, showcasing enhanced inference efficiency. + +
+
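The two concrete ingredients named in the abstract, bitplane decomposition and XOR-based noising, are easy to illustrate. The sketch below applies a single noising step with a fixed flip probability; the actual noise schedule and the BCE-trained denoiser are not shown, and all numbers are assumptions:

```python
import numpy as np

def to_bitplanes(img_u8):
    """Decompose a uint8 image (H, W) into 8 binary bitplanes (8, H, W)."""
    return np.stack([(img_u8 >> b) & 1 for b in range(8)]).astype(np.uint8)

def from_bitplanes(planes):
    """Reassemble the uint8 image from its 8 bitplanes."""
    return sum(planes[b].astype(np.uint16) << b for b in range(8)).astype(np.uint8)

def xor_noise(planes, flip_prob, rng):
    """One binary 'diffusion' step: XOR each bit with a Bernoulli(flip_prob) mask."""
    mask = (rng.random(planes.shape) < flip_prob).astype(np.uint8)
    return planes ^ mask

rng = np.random.default_rng(0)
img = rng.integers(0, 256, size=(32, 32), dtype=np.uint8)
noisy_planes = xor_noise(to_bitplanes(img), flip_prob=0.1, rng=rng)
noisy_img = from_bitplanes(noisy_planes)
```

A denoising network would then be trained with binary cross-entropy to predict the clean bitplanes from `noisy_planes`.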
+
+
+
+ + ☆ PointOBB-v3: Expanding Performance Boundaries of Single Point-Supervised + Oriented Object Detection + + +
+ With the growing demand for oriented object detection (OOD), recent studies +on point-supervised OOD have attracted significant interest. In this paper, we +propose PointOBB-v3, a stronger single point-supervised OOD framework. Compared +to existing methods, it generates pseudo rotated boxes without additional +priors and incorporates support for the end-to-end paradigm. PointOBB-v3 +functions by integrating three unique image views: the original view, a resized +view, and a rotated/flipped (rot/flp) view. Based on the views, a scale +augmentation module and an angle acquisition module are constructed. In the +first module, a Scale-Sensitive Consistency (SSC) loss and a Scale-Sensitive +Feature Fusion (SSFF) module are introduced to improve the model's ability to +estimate object scale. To achieve precise angle predictions, the second module +employs symmetry-based self-supervised learning. Additionally, we introduce an +end-to-end version that eliminates the pseudo-label generation process by +integrating a detector branch and introduces an Instance-Aware Weighting (IAW) +strategy to focus on high-quality predictions. We conducted extensive +experiments on the DIOR-R, DOTA-v1.0/v1.5/v2.0, FAIR1M, STAR, and RSAR +datasets. Across all these datasets, our method achieves an average improvement +in accuracy of 3.56% in comparison to previous state-of-the-art methods. The +code will be available at https://github.com/ZpyWHU/PointOBB-v3. + +
+
+ comment: 16 pages, 5 figures, 10 tables +
+
+
+
+
+ + ☆ GUI-Bee: Align GUI Action Grounding to Novel Environments via Autonomous + Exploration + + +
+ Graphical User Interface (GUI) action grounding is a critical step in GUI +automation that maps language instructions to actionable elements on GUI +screens. Most recent works of GUI action grounding leverage large GUI datasets +to fine-tune MLLMs. However, the fine-tuning data always covers limited GUI +environments, and we find the performance of the resulting model deteriorates +in novel environments. We argue that the GUI grounding models should be further +aligned to the novel environments to reveal their full potential, when the +inference is known to involve novel environments, i.e., environments not used +during the previous fine-tuning. To realize this, we first propose GUI-Bee, an +MLLM-based autonomous agent, to collect high-quality, environment-specific data +through exploration and then continuously fine-tune GUI grounding models with +the collected data. Our agent leverages a novel Q-value-Incentive In-Context +Reinforcement Learning (Q-ICRL) method to optimize exploration efficiency and +data quality. Additionally, we introduce NovelScreenSpot, a benchmark for +testing how well the data can help align GUI action grounding models to novel +environments and demonstrate the effectiveness of data collected by GUI-Bee in +the experiments. Furthermore, we conduct an ablation study to validate the +Q-ICRL method in enhancing the efficiency of GUI-Bee. Project page: +https://gui-bee.github.io + +
+
+
+
+
+ + ☆ Pix2Cap-COCO: Advancing Visual Comprehension via Pixel-Level Captioning + + +
+ We present Pix2Cap-COCO, the first panoptic pixel-level caption dataset +designed to advance fine-grained visual understanding. To achieve this, we +carefully design an automated annotation pipeline that prompts GPT-4V to +generate pixel-aligned, instance-specific captions for individual objects +within images, enabling models to learn more granular relationships between +objects and their contexts. This approach results in 167,254 detailed captions, +with an average of 22.94 words per caption. Building on Pix2Cap-COCO, we +introduce a novel task, panoptic segmentation-captioning, which challenges +models to recognize instances in an image and provide detailed descriptions for +each simultaneously. To benchmark this task, we design a robust baseline based +on X-Decoder. The experimental results demonstrate that Pix2Cap-COCO is a +particularly challenging dataset, as it requires models to excel in both +fine-grained visual understanding and detailed language generation. +Furthermore, we leverage Pix2Cap-COCO for Supervised Fine-Tuning (SFT) on large +multimodal models (LMMs) to enhance their performance. For example, training +with Pix2Cap-COCO significantly improves the performance of GPT4RoI, yielding +gains in CIDEr +1.4%, ROUGE +0.4%, and SPICE +0.5% on Visual Genome dataset, +and strengthens its region understanding ability on the ViP-BENCH, with an +overall improvement of +5.1%, including notable increases in recognition +accuracy +11.2% and language generation quality +22.2%. + +
+
+
+
+
+ + ☆ Generating Realistic Forehead-Creases for User Verification via + Conditioned Piecewise Polynomial Curves + + +
+ We propose a trait-specific image generation method that models forehead
+creases geometrically using B-spline and Bézier curves. This approach ensures
+the realistic generation of both principal creases and non-prominent crease
+patterns, effectively constructing detailed and authentic forehead-crease
+images. These geometrically rendered images serve as visual prompts for a
+diffusion-based Edge-to-Image translation model, which generates corresponding
+mated samples. The resulting novel synthetic identities are then used to train
+a forehead-crease verification network. To enhance intra-subject diversity in
+the generated samples, we employ two strategies: (a) perturbing the control
+points of B-splines under defined constraints to maintain label consistency,
+and (b) applying image-level augmentations to the geometric visual prompts,
+such as dropout and elastic transformations, specifically tailored to crease
+patterns. By integrating the proposed synthetic dataset with real-world data,
+our method significantly improves the performance of forehead-crease
+verification systems under a cross-database verification protocol.
+
+
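+            <p>
+            A minimal sketch of strategy (a), perturbing B-spline control points and re-evaluating the curve; the perturbation range, the clamped-uniform knot construction, and the perturbed_crease helper are hypothetical, and the paper's label-consistency constraints are not modeled here.
+            </p>
+            <pre><code>
+import numpy as np
+from scipy.interpolate import BSpline
+
+def perturbed_crease(ctrl_pts, degree=3, max_shift=2.0, rng=None):
+    """Jitter (n, 2) B-spline control points (in pixels) and resample the crease curve.
+    max_shift and the knot vector are illustrative choices."""
+    rng = np.random.default_rng() if rng is None else rng
+    noisy = ctrl_pts + rng.uniform(-max_shift, max_shift, size=ctrl_pts.shape)
+    n = len(noisy)                      # requires n > degree control points
+    knots = np.concatenate([np.zeros(degree),
+                            np.linspace(0.0, 1.0, n - degree + 1),
+                            np.ones(degree)])
+    spline = BSpline(knots, noisy, degree)
+    u = np.linspace(0.0, 1.0, 200)
+    return spline(u)                    # (200, 2) polyline of the perturbed crease
+            </code></pre>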
+
+ comment: Accepted at WACV-W 2025 +
+
+
+
+
+ + ☆ Multimodal Sensor Dataset for Monitoring Older Adults Post Lower-Limb + Fractures in Community Settings + + +
+ Lower-Limb Fractures (LLF) are a major health concern for older adults, often +leading to reduced mobility and prolonged recovery, potentially impairing daily +activities and independence. During recovery, older adults frequently face +social isolation and functional decline, complicating rehabilitation and +adversely affecting physical and mental health. Multi-modal sensor platforms +that continuously collect data and analyze it using machine-learning algorithms +can remotely monitor this population and infer health outcomes. They can also +alert clinicians to individuals at risk of isolation and decline. This paper +presents a new publicly available multi-modal sensor dataset, MAISON-LLF, +collected from older adults recovering from LLF in community settings. The +dataset includes data from smartphone and smartwatch sensors, motion detectors, +sleep-tracking mattresses, and clinical questionnaires on isolation and +decline. The dataset was collected from ten older adults living alone at home +for eight weeks each, totaling 560 days of 24-hour sensor data. For technical +validation, supervised machine-learning and deep-learning models were developed +using the sensor and clinical questionnaire data, providing a foundational +comparison for the research community. + +
+
+
+
+
+ + ☆ Eye Gaze as a Signal for Conveying User Attention in Contextual AI + Systems + + +
+ Advanced multimodal AI agents can now collaborate with users to solve
+challenges in the world. We explore eye tracking's role in such interaction to
+convey a user's attention relative to the physical environment. We hypothesize
+that this knowledge improves contextual understanding for AI agents. By
+observing hours of human-object interactions, we first measure the relationship
+between an eye tracker's signal quality and its ability to reliably place gaze
+on nearby physical objects. We then conduct experiments which relay the user's
+scanpath history as additional context when querying multimodal agents. Our results
+show that eye tracking provides high value as a user attention signal and can
+convey information about the user's current task and interests to the agent.
+
+
+
+
+
+
+ + ☆ Dual-Modal Prototype Joint Learning for Compositional Zero-Shot Learning + + +
+ Compositional Zero-Shot Learning (CZSL) aims to recognize novel compositions +of attributes and objects by leveraging knowledge learned from seen +compositions. Recent approaches have explored the use of Vision-Language Models +(VLMs) to align textual and visual modalities. These methods typically employ +prompt engineering, parameter-tuning, and modality fusion to generate rich +textual prototypes that serve as class prototypes for CZSL. However, the +modality gap results in textual prototypes being unable to fully capture the +optimal representations of all class prototypes, particularly those with +fine-grained features, which can be directly obtained from the visual modality. +In this paper, we propose a novel Dual-Modal Prototype Joint Learning framework +for the CZSL task. Our approach, based on VLMs, introduces prototypes in both +the textual and visual modalities. The textual prototype is optimized to +capture broad conceptual information, aiding the model's generalization across +unseen compositions. Meanwhile, the visual prototype is used to mitigate the +classification errors caused by the modality gap and capture fine-grained +details to distinguish images with similar appearances. To effectively optimize +these prototypes, we design specialized decomposition modules and a joint +learning strategy that enrich the features from both modalities. These +prototypes not only capture key category information during training but also +serve as crucial reference targets during inference. Experimental results +demonstrate that our approach achieves state-of-the-art performance in the +closed-world setting and competitive performance in the open-world setting +across three publicly available CZSL benchmarks. These findings validate the +effectiveness of our method in advancing compositional generalization. + +
+
+
+
+
+ + ☆ First Lessons Learned of an Artificial Intelligence Robotic System for + Autonomous Coarse Waste Recycling Using Multispectral Imaging-Based Methods + + +
+ Current disposal facilities for coarse-grained waste perform manual sorting
+of materials with heavy machinery. Large quantities of recyclable materials are
+lost to coarse waste, so more effective sorting processes must be developed to
+recover them. Two key aspects to automate the sorting process are object
+detection with material classification in mixed piles of waste, and autonomous
+control of hydraulic machinery. Because most objects in those accumulations of
+waste are damaged or destroyed, object detection alone is not feasible in the
+majority of cases. To address these challenges, we propose a classification of
+materials with multispectral images of the ultraviolet (UV), visual (VIS), near
+infrared (NIR), and short-wave infrared (SWIR) spectra. A solution for
+autonomous control of hydraulic heavy machines for sorting bulky waste is
+being investigated using cost-effective cameras and artificial
+intelligence-based controllers.
+
+
+
+ comment: Published in Proceedings of Sardinia 2023, 19th International + Symposium on Waste Management, Resource Recovery and Sustainable Landfilling +
+
+
+
+
+ + ☆ Where Do You Go? Pedestrian Trajectory Prediction using Scene Features + + +
+ Accurate prediction of pedestrian trajectories is crucial for enhancing the +safety of autonomous vehicles and reducing traffic fatalities involving +pedestrians. While numerous studies have focused on modeling interactions among +pedestrians to forecast their movements, the influence of environmental factors +and scene-object placements has been comparatively underexplored. In this +paper, we present a novel trajectory prediction model that integrates both +pedestrian interactions and environmental context to improve prediction +accuracy. Our approach captures spatial and temporal interactions among +pedestrians within a sparse graph framework. To account for pedestrian-scene +interactions, we employ advanced image enhancement and semantic segmentation +techniques to extract detailed scene features. These scene and interaction +features are then fused through a cross-attention mechanism, enabling the model +to prioritize relevant environmental factors that influence pedestrian +movements. Finally, a temporal convolutional network processes the fused +features to predict future pedestrian trajectories. Experimental results +demonstrate that our method significantly outperforms existing state-of-the-art +approaches, achieving ADE and FDE values of 0.252 and 0.372 meters, +respectively, underscoring the importance of incorporating both social +interactions and environmental context in pedestrian trajectory prediction. + +
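+            <p>
+            For reference, the reported ADE and FDE are the standard trajectory-prediction metrics: the mean and the final-step Euclidean error between predicted and ground-truth future positions. A minimal NumPy sketch:
+            </p>
+            <pre><code>
+import numpy as np
+
+def ade_fde(pred, gt):
+    """Average / Final Displacement Error for one trajectory.
+    pred, gt: (T, 2) arrays of predicted and ground-truth positions in metres."""
+    dists = np.linalg.norm(pred - gt, axis=-1)
+    return dists.mean(), dists[-1]
+            </code></pre>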
+
+ comment: Accepted by 2024 International Conference on Intelligent Computing + and its Emerging Applications +
+
+
+
+
+ + ☆ MV-GMN: State Space Model for Multi-View Action Recognition + + +
+ Recent advancements in multi-view action recognition have largely relied on +Transformer-based models. While effective and adaptable, these models often +require substantial computational resources, especially in scenarios with +multiple views and multiple temporal sequences. Addressing this limitation, +this paper introduces the MV-GMN model, a state-space model specifically +designed to efficiently aggregate multi-modal data (RGB and skeleton), +multi-view perspectives, and multi-temporal information for action recognition +with reduced computational complexity. The MV-GMN model employs an innovative +Multi-View Graph Mamba network comprising a series of MV-GMN blocks. Each block +includes a proposed Bidirectional State Space Block and a GCN module. The +Bidirectional State Space Block introduces four scanning strategies, including +view-prioritized and time-prioritized approaches. The GCN module leverages +rule-based and KNN-based methods to construct the graph network, effectively +integrating features from different viewpoints and temporal instances. +Demonstrating its efficacy, MV-GMN outperforms the state-of-the-arts on several +datasets, achieving notable accuracies of 97.3\% and 96.7\% on the NTU RGB+D +120 dataset in cross-subject and cross-view scenarios, respectively. MV-GMN +also surpasses Transformer-based baselines while requiring only linear +inference complexity, underscoring the model's ability to reduce computational +load and enhance the scalability and applicability of multi-view action +recognition technologies. + +
+
+
+
+
+ + ☆ Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline + Professional Videos + + +
+ Humans acquire knowledge through three cognitive stages: perceiving +information, comprehending knowledge, and adapting knowledge to solve novel +problems. Videos serve as an effective medium for this learning process, +facilitating a progression through these cognitive stages. However, existing +video benchmarks fail to systematically evaluate the knowledge acquisition +capabilities in Large Multimodal Models (LMMs). To address this gap, we +introduce Video-MMMU, a multi-modal, multi-disciplinary benchmark designed to +assess LMMs' ability to acquire and utilize knowledge from videos. Video-MMMU +features a curated collection of 300 expert-level videos and 900 +human-annotated questions across six disciplines, evaluating knowledge +acquisition through stage-aligned question-answer pairs: Perception, +Comprehension, and Adaptation. A proposed knowledge gain metric, +{\Delta}knowledge, quantifies improvement in performance after video viewing. +Evaluation of LMMs reveals a steep decline in performance as cognitive demands +increase and highlights a significant gap between human and model knowledge +acquisition, underscoring the need for methods to enhance LMMs' capability to +learn and adapt from videos. + +
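+            <p>
+            The Δknowledge metric is only described above as quantifying post-viewing improvement; one plausible normalisation (an assumption on our part, not necessarily the paper's exact formula) is the relative gain over the remaining headroom:
+            </p>
+            <pre><code>
+def knowledge_gain(acc_before, acc_after):
+    """Relative gain over the remaining headroom after watching the video,
+    with accuracies given in percent. This normalisation is an assumption."""
+    return 100.0 * (acc_after - acc_before) / (100.0 - acc_before)
+            </code></pre>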
+
+
+
+
+ + ☆ Ensuring Medical AI Safety: Explainable AI-Driven Detection and + Mitigation of Spurious Model Behavior and Associated Data + + +
+ Deep neural networks are increasingly employed in high-stakes medical +applications, despite their tendency for shortcut learning in the presence of +spurious correlations, which can have potentially fatal consequences in +practice. Detecting and mitigating shortcut behavior is a challenging task that +often requires significant labeling efforts from domain experts. To alleviate +this problem, we introduce a semi-automated framework for the identification of +spurious behavior from both data and model perspective by leveraging insights +from eXplainable Artificial Intelligence (XAI). This allows the retrieval of +spurious data points and the detection of model circuits that encode the +associated prediction rules. Moreover, we demonstrate how these shortcut +encodings can be used for XAI-based sample- and pixel-level data annotation, +providing valuable information for bias mitigation methods to unlearn the +undesired shortcut behavior. We show the applicability of our framework using +four medical datasets across two modalities, featuring controlled and +real-world spurious correlations caused by data artifacts. We successfully +identify and mitigate these biases in VGG16, ResNet50, and contemporary Vision +Transformer models, ultimately increasing their robustness and applicability +for real-world medical tasks. + +
+
+
+
+
+ + ☆ By-Example Synthesis of Vector Textures + + +
+ We propose a new method for synthesizing an arbitrarily sized novel vector +texture given a single raster exemplar. Our method first segments the exemplar +to extract the primary textons, and then clusters them based on visual +similarity. We then compute a descriptor to capture each texton's neighborhood +which contains the inter-category relationships that are used at synthesis +time. Next, we use a simple procedure to both extract and place the secondary +textons behind the primary polygons. Finally, our method constructs a gradient +field for the background which is defined by a set of data points and colors. +The color of the secondary polygons are also adjusted to better match the +gradient field. To compare our work with other methods, we use a wide range of +perceptual-based metrics. + +
+
+
+
+
+ + ☆ EgoHand: Ego-centric Hand Pose Estimation and Gesture Recognition with + Head-mounted Millimeter-wave Radar and IMUs + + +
+ Recent advanced Virtual Reality (VR) headsets, such as the Apple Vision Pro, +employ bottom-facing cameras to detect hand gestures and inputs, which offers +users significant convenience in VR interactions. However, these bottom-facing +cameras can sometimes be inconvenient and pose a risk of unintentionally +exposing sensitive information, such as private body parts or personal +surroundings. To mitigate these issues, we introduce EgoHand. This system +provides an alternative solution by integrating millimeter-wave radar and IMUs +for hand gesture recognition, thereby offering users an additional option for +gesture interaction that enhances privacy protection. To accurately recognize +hand gestures, we devise a two-stage skeleton-based gesture recognition scheme. +In the first stage, a novel end-to-end Transformer architecture is employed to +estimate the coordinates of hand joints. Subsequently, these estimated joint +coordinates are utilized for gesture recognition. Extensive experiments +involving 10 subjects show that EgoHand can detect hand gestures with 90.8% +accuracy. Furthermore, EgoHand demonstrates robust performance across a variety +of cross-domain tests, including different users, dominant hands, body +postures, and scenes. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ PromptMono: Cross Prompting Attention for Self-Supervised Monocular + Depth Estimation in Challenging Environments + + +
+ Considerable efforts have been made to improve monocular depth estimation +under ideal conditions. However, in challenging environments, monocular depth +estimation still faces difficulties. In this paper, we introduce visual prompt +learning for predicting depth across different environments within a unified +model, and present a self-supervised learning framework called PromptMono. It +employs a set of learnable parameters as visual prompts to capture +domain-specific knowledge. To integrate prompting information into image +representations, a novel gated cross prompting attention (GCPA) module is +proposed, which enhances the depth estimation in diverse conditions. We +evaluate the proposed PromptMono on the Oxford Robotcar dataset and the +nuScenes dataset. Experimental results demonstrate the superior performance of +the proposed method. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Training-Free Zero-Shot Temporal Action Detection with Vision-Language + Models + + +
+ Existing zero-shot temporal action detection (ZSTAD) methods predominantly +use fully supervised or unsupervised strategies to recognize unseen activities. +However, these training-based methods are prone to domain shifts and require +high computational costs, which hinder their practical applicability in +real-world scenarios. In this paper, unlike previous works, we propose a +training-Free Zero-shot temporal Action Detection (FreeZAD) method, leveraging +existing vision-language (ViL) models to directly classify and localize unseen +activities within untrimmed videos without any additional fine-tuning or +adaptation. We mitigate the need for explicit temporal modeling and reliance on +pseudo-label quality by designing the LOGarithmic decay weighted +Outer-Inner-Contrastive Score (LogOIC) and frequency-based Actionness +Calibration. Furthermore, we introduce a test-time adaptation (TTA) strategy +using Prototype-Centric Sampling (PCS) to expand FreeZAD, enabling ViL models +to adapt more effectively for ZSTAD. Extensive experiments on the THUMOS14 and +ActivityNet-1.3 datasets demonstrate that our training-free method outperforms +state-of-the-art unsupervised methods while requiring only 1/13 of the runtime. +When equipped with TTA, the enhanced method further narrows the gap with fully +supervised methods. + +
+
+
+
+
+ + ☆ Solving the long-tailed distribution problem by exploiting the synergies + and balance of different techniques + + +
+ In real-world data, long-tailed data distribution is common, making it +challenging for models trained on empirical risk minimisation to learn and +classify tail classes effectively. While many studies have sought to improve +long tail recognition by altering the data distribution in the feature space +and adjusting model decision boundaries, research on the synergy and corrective +approach among various methods is limited. Our study delves into three +long-tail recognition techniques: Supervised Contrastive Learning (SCL), +Rare-Class Sample Generator (RSG), and Label-Distribution-Aware Margin Loss +(LDAM). SCL enhances intra-class clusters based on feature similarity and +promotes clear inter-class separability but tends to favour dominant classes +only. When RSG is integrated into the model, we observed that the intra-class +features further cluster towards the class centre, which demonstrates a +synergistic effect together with SCL's principle of enhancing intra-class +clustering. RSG generates new tail features and compensates for the tail +feature space squeezed by SCL. Similarly, LDAM is known to introduce a larger +margin specifically for tail classes; we demonstrate that LDAM further bolsters +the model's performance on tail classes when combined with the more explicit +decision boundaries achieved by SCL and RSG. Furthermore, SCL can compensate +for the dominant class accuracy sacrificed by RSG and LDAM. Our research +emphasises the synergy and balance among the three techniques, with each +amplifying the strengths of the others and mitigating their shortcomings. Our +experiment on long-tailed distribution datasets, using an end-to-end +architecture, yields competitive results by enhancing tail class accuracy +without compromising dominant class performance, achieving a balanced +improvement across all classes. + +
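+            <p>
+            Of the three techniques discussed, LDAM has a well-known closed form: a per-class margin proportional to the inverse fourth root of the class count is subtracted from the true-class logit before (scaled) cross entropy. A compact PyTorch sketch, with hyper-parameters that are illustrative rather than the paper's:
+            </p>
+            <pre><code>
+import torch
+import torch.nn.functional as F
+
+def ldam_loss(logits, targets, cls_counts, max_margin=0.5, scale=30.0):
+    """Label-Distribution-Aware Margin loss: margins proportional to n_j**-0.25
+    are subtracted from the true-class logits (sketch; hyper-parameters assumed)."""
+    margins = 1.0 / cls_counts.float() ** 0.25
+    margins = margins * (max_margin / margins.max())
+    adjusted = logits.clone()
+    adjusted[torch.arange(logits.size(0)), targets] -= margins[targets]
+    return F.cross_entropy(scale * adjusted, targets)
+            </code></pre>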
+
+ comment: 13 +
+
+
+
+
+ + ☆ On Disentangled Training for Nonlinear Transform in Learned Image + Compression ICLR2025 + + +
+ Learned image compression (LIC) has demonstrated superior rate-distortion
+(R-D) performance compared to traditional codecs, but is challenged by training
+inefficiency: it can take more than two weeks to train a state-of-the-art
+model from scratch. Existing LIC methods overlook the slow convergence caused
+by compacting energy in learning nonlinear transforms. In this paper, we first
+reveal that such energy compaction consists of two components, i.e., feature
+decorrelation and uneven energy modulation. On this basis, we propose a linear
+auxiliary transform (AuxT) to disentangle energy compaction in training
+nonlinear transforms. The proposed AuxT obtains a coarse approximation to achieve
+efficient energy compaction, so that distribution fitting with the nonlinear
+transforms can be simplified to fine details. We then develop wavelet-based
+linear shortcuts (WLSs) for AuxT that leverage wavelet-based downsampling and
+orthogonal linear projection for feature decorrelation, and subband-aware
+scaling for uneven energy modulation. AuxT is lightweight and plug-and-play, and
+can be integrated into diverse LIC models to address the slow convergence issue.
+Experimental results demonstrate that the proposed approach can accelerate the
+training of LIC models by a factor of two and simultaneously achieves an average 1%
+BD-rate reduction. To the best of our knowledge, this is one of the first successful
+attempts to significantly improve the convergence of LIC with comparable
+or superior rate-distortion performance. Code will be released at
+https://github.com/qingshi9974/AuxT
+
+
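+            <p>
+            The wavelet-based linear shortcut is not specified in detail above, but its building block, an orthogonal wavelet downsampling, can be illustrated with a single-level 2D Haar transform. This is a sketch under the assumption of a Haar basis; the paper's orthogonal projection and subband-aware scaling are omitted.
+            </p>
+            <pre><code>
+import torch
+
+def haar_downsample(x):
+    """Single-level orthonormal 2D Haar transform on (B, C, H, W) feature maps
+    with even H and W; returns the four subbands stacked along channels."""
+    a = x[..., 0::2, 0::2]
+    b = x[..., 0::2, 1::2]
+    c = x[..., 1::2, 0::2]
+    d = x[..., 1::2, 1::2]
+    ll = (a + b + c + d) / 2
+    lh = (a - b + c - d) / 2
+    hl = (a + b - c - d) / 2
+    hh = (a - b - c + d) / 2
+    return torch.cat([ll, lh, hl, hh], dim=1)
+            </code></pre>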
+
+ comment: Accepted by ICLR2025 +
+
+
+
+
+ + ☆ You Only Crash Once v2: Perceptually Consistent Strong Features for + One-Stage Domain Adaptive Detection of Space Terrain + + +
+ The in-situ detection of planetary, lunar, and small-body surface terrain is +crucial for autonomous spacecraft applications, where learning-based computer +vision methods are increasingly employed to enable intelligence without prior +information or human intervention. However, many of these methods remain +computationally expensive for spacecraft processors and prevent real-time +operation. Training of such algorithms is additionally complex due to the +scarcity of labeled data and reliance on supervised learning approaches. +Unsupervised Domain Adaptation (UDA) offers a promising solution by +facilitating model training with disparate data sources such as simulations or +synthetic scenes, although UDA is difficult to apply to celestial environments +where challenging feature spaces are paramount. To alleviate such issues, You +Only Crash Once (YOCOv1) has studied the integration of Visual Similarity-based +Alignment (VSA) into lightweight one-stage object detection architectures to +improve space terrain UDA. Although proven effective, the approach faces +notable limitations, including performance degradations in multi-class and +high-altitude scenarios. Building upon the foundation of YOCOv1, we propose +novel additions to the VSA scheme that enhance terrain detection capabilities +under UDA, and our approach is evaluated across both simulated and real-world +data. Our second YOCO rendition, YOCOv2, is capable of achieving +state-of-the-art UDA performance on surface terrain detection, where we +showcase improvements upwards of 31% compared with YOCOv1 and terrestrial +state-of-the-art. We demonstrate the practical utility of YOCOv2 with +spacecraft flight hardware performance benchmarking and qualitative evaluation +of NASA mission data. + +
+
+
+
+
+ + ☆ A Mutual Information Perspective on Multiple Latent Variable Generative + Models for Positive View Generation + + +
+ In image generation, Multiple Latent Variable Generative Models (MLVGMs) +employ multiple latent variables to gradually shape the final images, from +global characteristics to finer and local details (e.g., StyleGAN, NVAE), +emerging as powerful tools for diverse applications. Yet their generative +dynamics and latent variable utilization remain only empirically observed. In +this work, we propose a novel framework to systematically quantify the impact +of each latent variable in MLVGMs, using Mutual Information (MI) as a guiding +metric. Our analysis reveals underutilized variables and can guide the use of +MLVGMs in downstream applications. + With this foundation, we introduce a method for generating synthetic data for +Self-Supervised Contrastive Representation Learning (SSCRL). By leveraging the +hierarchical and disentangled variables of MLVGMs, and guided by the previous +analysis, we apply tailored latent perturbations to produce diverse views for +SSCRL, without relying on real data altogether. + Additionally, we introduce a Continuous Sampling (CS) strategy, where the +generator dynamically creates new samples during SSCRL training, greatly +increasing data variability. Our comprehensive experiments demonstrate the +effectiveness of these contributions, showing that MLVGMs' generated views +compete on par with or even surpass views generated from real data. + This work establishes a principled approach to understanding and exploiting +MLVGMs, advancing both generative modeling and self-supervised learning. + +
+
+
+
+
+ + ☆ Skin Disease Detection and Classification of Actinic Keratosis and + Psoriasis Utilizing Deep Transfer Learning + + +
+ Skin diseases can arise from infections, allergies, genetic factors, +autoimmune disorders, hormonal imbalances, or environmental triggers such as +sun damage and pollution. Some skin diseases, such as Actinic Keratosis and +Psoriasis, can be fatal if not treated in time. Early identification is +crucial, but the diagnostic methods for these conditions are often expensive +and not widely accessible. In this study, we propose a novel and efficient +method for diagnosing skin diseases using deep learning techniques. This +approach employs a modified VGG16 Convolutional Neural Network (CNN) model. The +model includes several convolutional layers and utilizes ImageNet weights with +modified top layers. The top layer is updated with fully connected layers and a +final softmax activation layer to classify skin diseases. The dataset used, +titled "Skin Disease Dataset," is publicly available. While the VGG16 +architecture does not include data augmentation by default, preprocessing +techniques such as rotation, shifting, and zooming were applied to augment the +data prior to model training. The proposed methodology achieved 90.67% accuracy +using the modified VGG16 model, demonstrating its reliability in classifying +skin diseases. The promising results highlight the potential of this approach +for real-world applications. + +
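+            <p>
+            A minimal sketch of the kind of modified VGG16 head described above, written in PyTorch (torchvision 0.13+) for illustration; the original work does not specify the framework or the hidden-layer width, so both are assumptions.
+            </p>
+            <pre><code>
+import torch.nn as nn
+from torchvision import models
+
+def build_modified_vgg16(num_classes=2):
+    """VGG16 with ImageNet weights and a replaced classification head
+    (hidden width and dropout rate are illustrative assumptions)."""
+    model = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
+    for p in model.features.parameters():
+        p.requires_grad = False         # keep the pretrained convolutional blocks
+    model.classifier = nn.Sequential(
+        nn.Linear(512 * 7 * 7, 256),
+        nn.ReLU(inplace=True),
+        nn.Dropout(0.5),
+        nn.Linear(256, num_classes),    # softmax is applied via CrossEntropyLoss
+    )
+    return model
+            </code></pre>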
+
+
+
+
+ + ☆ YOLO11-JDE: Fast and Accurate Multi-Object Tracking with Self-Supervised + Re-ID + + +
+ We introduce YOLO11-JDE, a fast and accurate multi-object tracking (MOT)
+solution that combines real-time object detection with self-supervised
+Re-Identification (Re-ID). By incorporating a dedicated Re-ID branch into
+YOLO11s, our model performs Joint Detection and Embedding (JDE), generating
+appearance features for each detection. The Re-ID branch is trained in a fully
+self-supervised setting while simultaneously training for detection,
+eliminating the need for costly identity-labeled datasets. The triplet loss,
+with hard positive and semi-hard negative mining strategies, is used for
+learning discriminative embeddings. Data association is enhanced with a custom
+tracking implementation that successfully integrates motion, appearance, and
+location cues. YOLO11-JDE achieves competitive results on MOT17 and MOT20
+benchmarks, surpassing existing JDE methods in terms of FPS and using up to ten
+times fewer parameters. This makes our method a highly attractive solution
+for real-world applications.
+
+
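+            <p>
+            One common way to implement the triplet loss with hard-positive and semi-hard-negative mining mentioned above is batch-wise mining over pairwise embedding distances. The sketch below is illustrative, not the authors' exact implementation, and assumes each identity appears at least twice per batch.
+            </p>
+            <pre><code>
+import torch
+import torch.nn.functional as F
+
+def triplet_hard_semihard(embeddings, ids, margin=0.3):
+    """Batch-wise triplet loss with hardest-positive and semi-hard-negative mining.
+    embeddings: (N, D) L2-normalised appearance features; ids: (N,) identity labels."""
+    dist = torch.cdist(embeddings, embeddings)                # (N, N) pairwise distances
+    same = ids.unsqueeze(0) == ids.unsqueeze(1)
+    eye = torch.eye(len(ids), dtype=torch.bool, device=ids.device)
+    pos_mask = same & ~eye
+    neg_mask = ~same
+
+    d_ap = (dist * pos_mask.float()).max(dim=1).values        # hardest (farthest) positive
+    # semi-hard negative: closest negative that is still farther away than the positive
+    blocked = (~neg_mask) | dist.le(d_ap.unsqueeze(1))
+    d_an = dist.masked_fill(blocked, float("inf")).min(dim=1).values
+    loss = F.relu(d_ap - d_an + margin)
+    return loss[torch.isfinite(d_an)].mean()                  # skip anchors without a valid triplet
+            </code></pre>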
+
+ comment: This paper has been accepted to the 5th Workshop on Real-World + Surveillance: Applications and Challenges (WACV 2025) +
+
+
+
+
+ + ☆ Regularizing cross entropy loss via minimum entropy and K-L divergence + + +
+ I introduce two novel loss functions for classification in deep learning. The
+two loss functions extend standard cross entropy loss by regularizing it with
+minimum entropy and Kullback-Leibler (K-L) divergence terms. The first of the
+two novel loss functions is termed mixed entropy loss (MIX-ENT for short),
+while the second one is termed minimum entropy regularized cross-entropy loss
+(MIN-ENT for short). The MIX-ENT function introduces a regularizer that can be
+shown to be equivalent to the sum of a minimum entropy term and a K-L
+divergence term. However, it should be noted that the K-L divergence term here
+is different from that in the standard cross-entropy loss function, in the
+sense that it swaps the roles of the target probability and the hypothesis
+probability. The MIN-ENT function simply adds a minimum entropy regularizer to
+the standard cross entropy loss function. In both MIX-ENT and MIN-ENT, the
+minimum entropy regularizer minimizes the entropy of the hypothesis probability
+distribution which is output by the neural network. Experiments on the
+EMNIST-Letters dataset show that my implementation of MIX-ENT and MIN-ENT lets
+the VGG model climb from its previous 3rd position on the paperswithcode
+leaderboard to reach the 2nd position on the leaderboard, outperforming the
+Spinal-VGG model in so doing. Specifically, using standard cross-entropy, VGG
+achieves 95.86% while Spinal-VGG achieves 95.88% classification accuracies,
+whereas using VGG (without Spinal-VGG) our MIN-ENT achieved 95.933%, while our
+MIX-ENT achieved 95.927% accuracies. The pre-trained models for both MIX-ENT
+and MIN-ENT are at https://github.com/rahmanoladi/minimum entropy project.
+
+
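+            <p>
+            The simpler of the two losses, MIN-ENT, follows directly from the description above: standard cross entropy plus a term that penalises the entropy of the predicted (hypothesis) distribution. The weighting coefficient below is an assumption, and MIX-ENT's swapped-KL regulariser is omitted from this sketch.
+            </p>
+            <pre><code>
+import torch
+import torch.nn.functional as F
+
+def min_ent_loss(logits, targets, lam=0.1):
+    """Cross entropy plus a minimum-entropy regulariser on the predicted
+    distribution; the weight lam is an assumed value."""
+    log_q = F.log_softmax(logits, dim=1)
+    entropy = -(log_q.exp() * log_q).sum(dim=1).mean()   # H(q), entropy of the hypothesis
+    return F.cross_entropy(logits, targets) + lam * entropy
+            </code></pre>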
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ EventVL: Understand Event Streams via Multimodal Large Language Model + + +
+ Event-based Vision-Language Models (VLMs) have recently made good progress
+on practical vision tasks. However, most of these works just utilize CLIP and
+focus on traditional perception tasks, which prevents the models from explicitly
+understanding the rich semantics and context in event streams. To address
+this deficiency, we propose EventVL, the first generative event-based MLLM
+(Multimodal Large Language Model) framework for explicit semantic
+understanding. Specifically, to bridge the data gap in connecting the semantics
+of different modalities, we first annotate a large event-image/video-text dataset
+containing almost 1.4 million high-quality pairs of data, which enables
+effective learning across various scenes, e.g., driving scenes or human motion.
+After that, we design an Event Spatiotemporal Representation to fully explore the
+comprehensive information by diversely aggregating and segmenting the event
+stream. To further promote a compact semantic space, Dynamic Semantic Alignment
+is introduced to improve and complete the sparse semantic space of events.
+Extensive experiments show that our EventVL can significantly surpass existing
+MLLM baselines in event captioning and scene description generation tasks. We
+hope our research can contribute to the development of the event vision
+community.
+
+
+
+
+
+
+ + ☆ Training-Free Consistency Pipeline for Fashion Repose + + +
+ Recent advancements in diffusion models have significantly broadened the
+possibilities for editing images of real-world objects. However, performing
+non-rigid transformations, such as changing the pose of objects or image-based
+conditioning, remains challenging. Maintaining object identity during these
+edits is difficult, and current methods often fall short of the precision
+needed for industrial applications, where consistency is critical.
+Additionally, fine-tuning diffusion models requires custom training data, which
+is not always accessible in real-world scenarios. This work introduces
+FashionRepose, a training-free pipeline for non-rigid pose editing specifically
+designed for the fashion industry. The approach integrates off-the-shelf models
+to adjust poses of long-sleeve garments, maintaining identity and branding
+attributes. FashionRepose uses a zero-shot approach to perform these edits in
+near real-time, eliminating the need for specialized training while maintaining
+consistent image editing. The solution holds potential for applications in the
+fashion industry and other fields demanding identity preservation in image editing.
+
+
+
+
+
+
+ + ☆ Variational U-Net with Local Alignment for Joint Tumor Extraction and + Registration (VALOR-Net) of Breast MRI Data Acquired at Two Different Field + Strengths + + +
+ Background: Multiparametric breast MRI data might improve tumor diagnostics,
+characterization, and treatment planning. Accurate alignment and delineation of
+images acquired at different field strengths, such as 3T and 7T, remain
+challenging research tasks. Purpose: To address alignment challenges and enable
+consistent tumor segmentation across different MRI field strengths. Study type:
+Retrospective. Subjects: Nine female subjects with breast tumors were involved:
+six histologically proven invasive ductal carcinomas (IDC) and three
+fibroadenomas. Field strength/sequence: Imaging was performed at 3T and 7T
+scanners using a post-contrast T1-weighted three-dimensional time-resolved
+angiography with stochastic trajectories (TWIST) sequence. Assessments: The
+method's performance for joint image registration and tumor segmentation was
+evaluated using several quantitative metrics, including peak signal-to-noise ratio
+(PSNR), structural similarity index (SSIM), normalized cross-correlation (NCC),
+Dice coefficient, F1 score, and relative sum of squared differences (rel SSD).
+Statistical tests: The Pearson correlation coefficient was used to test the
+relationship between the registration and segmentation metrics. Results: When
+calculated for each subject individually, the PSNR was in a range from 27.5 to
+34.5 dB, and the SSIM was from 82.6 to 92.8%. The model achieved an NCC from
+96.4 to 99.3% and a Dice coefficient of 62.9 to 95.3%. The F1 score was between
+55.4 and 93.2% and the rel SSD was in the range of 2.0 to 7.5%. The
+segmentation metrics Dice and F1 Score are highly correlated (0.995), while a
+moderate correlation between NCC and SSIM (0.681) was found for registration.
+Data conclusion: Initial results demonstrate that the proposed method may be
+feasible in providing joint tumor segmentation and registration of MRI data
+acquired at different field strengths.
+
+
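+            <p>
+            Two of the reported metrics, the Dice coefficient and PSNR, have standard definitions that can be computed as follows (NumPy sketch; the data range used by the authors is an assumption):
+            </p>
+            <pre><code>
+import numpy as np
+
+def dice(pred_mask, gt_mask, eps=1e-7):
+    """Dice coefficient between two binary masks."""
+    inter = np.logical_and(pred_mask, gt_mask).sum()
+    return 2.0 * inter / (pred_mask.sum() + gt_mask.sum() + eps)
+
+def psnr(x, y, data_range=1.0):
+    """Peak signal-to-noise ratio in dB; the data_range value is an assumption."""
+    mse = np.mean((x - y) ** 2)
+    return 10.0 * np.log10(data_range ** 2 / mse)
+            </code></pre>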
+
+
+
+
+ + ☆ MPG-SAM 2: Adapting SAM 2 with Mask Priors and Global Context for + Referring Video Object Segmentation + + +
+ Referring video object segmentation (RVOS) aims to segment objects in a video +according to textual descriptions, which requires the integration of multimodal +information and temporal dynamics perception. The Segment Anything Model 2 (SAM +2) has shown great effectiveness across various video segmentation tasks. +However, its application to offline RVOS is challenged by the translation of +the text into effective prompts and a lack of global context awareness. In this +paper, we propose a novel RVOS framework, termed MPG-SAM 2, to address these +challenges. Specifically, MPG-SAM 2 employs a unified multimodal encoder to +jointly encode video and textual features, generating semantically aligned +video and text embeddings, along with multimodal class tokens. A mask prior +generator utilizes the video embeddings and class tokens to create pseudo masks +of target objects and global context. These masks are fed into the prompt +encoder as dense prompts along with multimodal class tokens as sparse prompts +to generate accurate prompts for SAM 2. To provide the online SAM 2 with a +global view, we introduce a hierarchical global-historical aggregator, which +allows SAM 2 to aggregate global and historical information of target objects +at both pixel and object levels, enhancing the target representation and +temporal consistency. Extensive experiments on several RVOS benchmarks +demonstrate the superiority of MPG-SAM 2 and the effectiveness of our proposed +modules. + +
+
+
+
+
+ + ☆ Enhancing Medical Image Analysis through Geometric and Photometric + transformations + + +
+ Medical image analysis suffers from a lack of labeled data due to several
+challenges, including patient privacy and a lack of experts. Since many AI
+models only perform well with large amounts of data, data augmentation offers a
+way to improve model performance and increase the dataset size through
+traditional or advanced techniques. In
+this paper, we evaluate the effectiveness of data augmentation techniques on
+two different medical image datasets. In the first step, we applied some
+transformation techniques to the skin cancer dataset containing benign and
+malignant classes. Then, we trained the convolutional neural network (CNN) on
+the dataset before and after augmentation; augmentation significantly improved test
+accuracy from 90.74% to 96.88% and decreased test loss from 0.7921 to 0.1468.
+In the second step, we used the Mixup technique by mixing
+two random images and their corresponding masks using the retina and blood
+vessels dataset, then we trained the U-Net model; the Dice
+coefficient increased from 0 before augmentation to 0.4163 after
+augmentation. These results show the effect of using data augmentation to
+increase the dataset size on classification and segmentation performance.
+
+
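+            <p>
+            The second augmentation step, Mixup on image/mask pairs, can be sketched as below; the Beta-distribution parameter is an assumed value, and the paper may post-process the mixed mask differently.
+            </p>
+            <pre><code>
+import numpy as np
+
+def mixup_pair(img_a, mask_a, img_b, mask_b, alpha=0.4, rng=None):
+    """Convex combination of two image/mask pairs (alpha is an assumed value)."""
+    rng = np.random.default_rng() if rng is None else rng
+    lam = rng.beta(alpha, alpha)
+    mixed_img = lam * img_a + (1 - lam) * img_b
+    mixed_mask = lam * mask_a + (1 - lam) * mask_b
+    return mixed_img, mixed_mask
+            </code></pre>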
+
+
+
+
+ + ☆ QMamba: Post-Training Quantization for Vision State Space Models + + +
+ State Space Models (SSMs), as key components of Mamba, have gained
+increasing attention for vision models recently, thanks to their efficient long
+sequence modeling capability. Given the computational cost of deploying SSMs on
+resource-limited edge devices, Post-Training Quantization (PTQ) is a technique
+with the potential for efficient deployment of SSMs. In this work, we propose
+QMamba, to our knowledge one of the first PTQ frameworks designed for vision
+SSMs, based on an analysis of the activation distributions in SSMs. We reveal
+that the distribution of discrete parameters exhibits long-tailed skewness and
+the distribution of the hidden state sequence exhibits highly dynamic
+variations. Correspondingly, we design Long-tailed Skewness Quantization (LtSQ)
+to quantize discrete parameters and Temporal Group Quantization (TGQ) to
+quantize hidden states, which reduces the quantization errors. Extensive
+experiments demonstrate that QMamba outperforms advanced PTQ methods on vision
+models across multiple model sizes and architectures. Notably, QMamba surpasses
+existing methods by 21.0% on ImageNet classification with 4-bit activations.
+
+
+
+
+
+
+ + ☆ Cognitive Paradigms for Evaluating VLMs on Visual Reasoning Task + + +
+ Evaluating the reasoning capabilities of Vision-Language Models (VLMs) in +complex visual tasks provides valuable insights into their potential and +limitations. In this work, we assess the performance of VLMs on the challenging +Bongard Openworld Problems benchmark, which involves reasoning over natural +images. We propose and evaluate three human-inspired paradigms: holistic +analysis (global context processing), deductive rule learning (explicit rule +derivation and application), and componential analysis (structured +decomposition of images into components). Our results demonstrate that +state-of-the-art models, including GPT-4o and Gemini, not only surpass human +benchmarks but also excel in structured reasoning tasks, with componential +analysis proving especially effective. However, ablation studies reveal key +challenges, such as handling synthetic images, making fine-grained +distinctions, and interpreting nuanced contextual information. These insights +underscore the need for further advancements in model robustness and +generalization, while highlighting the transformative potential of structured +reasoning approaches in enhancing VLM capabilities. + +
+
+
+
+
+ + ☆ Black-Box Adversarial Attack on Vision Language Models for Autonomous + Driving + + +
+ Vision-language models (VLMs) have significantly advanced autonomous driving +(AD) by enhancing reasoning capabilities; however, these models remain highly +susceptible to adversarial attacks. While existing research has explored +white-box attacks to some extent, the more practical and challenging black-box +scenarios remain largely underexplored due to their inherent difficulty. In +this paper, we take the first step toward designing black-box adversarial +attacks specifically targeting VLMs in AD. We identify two key challenges for +achieving effective black-box attacks in this context: the effectiveness across +driving reasoning chains in AD systems and the dynamic nature of driving +scenarios. To address this, we propose Cascading Adversarial Disruption (CAD). +It first introduces Decision Chain Disruption, which targets low-level +reasoning breakdown by generating and injecting deceptive semantics, ensuring +the perturbations remain effective across the entire decision-making chain. +Building on this, we present Risky Scene Induction, which addresses dynamic +adaptation by leveraging a surrogate VLM to understand and construct high-level +risky scenarios that are likely to result in critical errors in the current +driving contexts. Extensive experiments conducted on multiple AD VLMs and +benchmarks demonstrate that CAD achieves state-of-the-art attack effectiveness, +significantly outperforming existing methods (+13.43% on average). Moreover, we +validate its practical applicability through real-world attacks on AD vehicles +powered by VLMs, where the route completion rate drops by 61.11% and the +vehicle crashes directly into the obstacle vehicle with adversarial patches. +Finally, we release CADA dataset, comprising 18,808 adversarial +visual-question-answer pairs, to facilitate further evaluation and research in +this critical domain. Our codes and dataset will be available after paper's +acceptance. + +
+
+
+
+
+ + ☆ GoDe: Gaussians on Demand for Progressive Level of Detail and Scalable + Compression + + +
+ 3D Gaussian Splatting enhances real-time performance in novel view synthesis
+by representing scenes with mixtures of Gaussians and utilizing differentiable
+rasterization. However, it typically requires large storage capacity and high
+VRAM, demanding the design of effective pruning and compression techniques.
+Existing methods, while effective in some scenarios, struggle with scalability
+and fail to adapt models based on critical factors such as computing
+capabilities or bandwidth, requiring the model to be re-trained under different
+configurations. In this work, we propose a novel, model-agnostic technique that
+organizes Gaussians into several hierarchical layers, enabling a progressive
+Level of Detail (LoD) strategy. This method, combined with recent 3DGS
+compression approaches, allows a single model to instantly scale across several
+compression ratios, with minimal to no impact on quality compared to a single
+non-scalable model and without requiring re-training. We validate our approach
+on typical datasets and benchmarks, showcasing low distortion and substantial
+gains in terms of scalability and adaptability.
+
+
+
+
+
+
+ + ☆ One-Prompt-One-Story: Free-Lunch Consistent Text-to-Image Generation + Using a Single Prompt + + +
+ Text-to-image generation models can create high-quality images from input +prompts. However, they struggle to support the consistent generation of +identity-preserving requirements for storytelling. Existing approaches to this +problem typically require extensive training in large datasets or additional +modifications to the original model architectures. This limits their +applicability across different domains and diverse diffusion model +configurations. In this paper, we first observe the inherent capability of +language models, coined context consistency, to comprehend identity through +context with a single prompt. Drawing inspiration from the inherent context +consistency, we propose a novel training-free method for consistent +text-to-image (T2I) generation, termed "One-Prompt-One-Story" (1Prompt1Story). +Our approach 1Prompt1Story concatenates all prompts into a single input for T2I +diffusion models, initially preserving character identities. We then refine the +generation process using two novel techniques: Singular-Value Reweighting and +Identity-Preserving Cross-Attention, ensuring better alignment with the input +description for each frame. In our experiments, we compare our method against +various existing consistent T2I generation approaches to demonstrate its +effectiveness through quantitative metrics and qualitative assessments. Code is +available at https://github.com/byliutao/1Prompt1Story. + +
+
+
+
+
+ + ☆ ReasVQA: Advancing VideoQA with Imperfect Reasoning Process + + +
+ Video Question Answering (VideoQA) is a challenging task that requires +understanding complex visual and temporal relationships within videos to answer +questions accurately. In this work, we introduce \textbf{ReasVQA} +(Reasoning-enhanced Video Question Answering), a novel approach that leverages +reasoning processes generated by Multimodal Large Language Models (MLLMs) to +improve the performance of VideoQA models. Our approach consists of three +phases: reasoning generation, reasoning refinement, and learning from +reasoning. First, we generate detailed reasoning processes using additional +MLLMs, and second refine them via a filtering step to ensure data quality. +Finally, we use the reasoning data, which might be in an imperfect form, to +guide the VideoQA model via multi-task learning, on how to interpret and answer +questions based on a given video. We evaluate ReasVQA on three popular +benchmarks, and our results establish new state-of-the-art performance with +significant improvements of +2.9 on NExT-QA, +7.3 on STAR, and +5.9 on +IntentQA. Our findings demonstrate the supervising benefits of integrating +reasoning processes into VideoQA. Further studies validate each component of +our method, also with different backbones and MLLMs, and again highlight the +advantages of this simple but effective method. We offer a new perspective on +enhancing VideoQA performance by utilizing advanced reasoning techniques, +setting a new benchmark in this research field. + +
+
+ comment: Accepted to main conference at NAACL 2025; 8 pages; +
+
+
+
+
+ + ☆ Overcoming Support Dilution for Robust Few-shot Semantic Segmentation + + +
+ Few-shot Semantic Segmentation (FSS) is a challenging task that utilizes
+limited support images to segment associated unseen objects in query images.
+However, recent FSS methods are observed to perform worse when the number of
+shots is enlarged. As the support set enlarges, existing FSS networks struggle to
+concentrate on the high-contributed supports and could easily be overwhelmed by
+the low-contributed supports that could severely impair the mask predictions.
+In this work, we study this challenging issue, called support dilution. Our
+goal is to recognize, select, preserve, and enhance those high-contributed
+supports in the raw support pool. Technically, our method contains three novel
+parts. First, we propose a contribution index to quantitatively estimate whether a
+high-contributed support is diluted. Second, we develop the Symmetric Correlation
+(SC) module to preserve and enhance the high-contributed support features,
+minimizing the distraction by the low-contributed features. Third, we design
+the Support Image Pruning operation, to retrieve a compact and high-quality
+subset by discarding low-contributed supports. We conduct extensive experiments
+on two FSS benchmarks, COCO-20i and PASCAL-5i; the segmentation results
+demonstrate the compelling performance of our solution over state-of-the-art
+FSS approaches. Besides, we apply our solution to online segmentation and
+real-world segmentation, where convincing results show the practical
+value of our work in real-world scenarios.
+
+
+
+ comment: 15 pages, 15 figures +
+
+
+
+
+ + ☆ Diffusion-based Perceptual Neural Video Compression with Temporal + Diffusion Information Reuse + + +
+ Recently, foundational diffusion models have attracted considerable attention
+in image compression tasks, whereas their application to video compression
+remains largely unexplored. In this article, we introduce DiffVC, a
+diffusion-based perceptual neural video compression framework that effectively
+integrates a foundational diffusion model with the video conditional coding
+paradigm. This framework uses temporal context from the previously decoded frame
+and the reconstructed latent representation of the current frame to guide the
+diffusion model in generating high-quality results. To accelerate the iterative
+inference process of the diffusion model, we propose the Temporal Diffusion
+Information Reuse (TDIR) strategy, which significantly enhances inference
+efficiency with minimal performance loss by reusing the diffusion information
+from previous frames. Additionally, to address the challenges posed by
+distortion differences across various bitrates, we propose the Quantization
+Parameter-based Prompting (QPP) mechanism, which utilizes quantization
+parameters as prompts fed into the foundational diffusion model to explicitly
+modulate intermediate features, thereby enabling a robust variable bitrate
+diffusion-based neural compression framework. Experimental results demonstrate
+that our proposed solution delivers excellent performance in both perception
+metrics and visual quality.
+
+
+
+
+
+
+ + ☆ Text-driven Online Action Detection + + +
+ Detecting actions as they occur is essential for applications like video +surveillance, autonomous driving, and human-robot interaction. Known as online +action detection, this task requires classifying actions in streaming videos, +handling background noise, and coping with incomplete actions. Transformer +architectures are the current state-of-the-art, yet the potential of recent +advancements in computer vision, particularly vision-language models (VLMs), +remains largely untapped for this problem, partly due to high computational +costs. In this paper, we introduce TOAD: a Text-driven Online Action Detection +architecture that supports zero-shot and few-shot learning. TOAD leverages CLIP +(Contrastive Language-Image Pretraining) textual embeddings, enabling efficient +use of VLMs without significant computational overhead. Our model achieves +82.46% mAP on the THUMOS14 dataset, outperforming existing methods, and sets +new baselines for zero-shot and few-shot performance on the THUMOS14 and +TVSeries datasets. + +
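+            <p>
+            TOAD's use of CLIP textual embeddings can be illustrated with the OpenAI clip package: encode one text prompt per action class once, then match streaming frame features against the normalised text embeddings. The prompt template and class names below are hypothetical, and this is only a sketch of the general idea, not the paper's architecture.
+            </p>
+            <pre><code>
+import torch
+import clip  # pip install git+https://github.com/openai/CLIP.git
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+action_names = ["pole vault", "high jump", "background"]   # hypothetical class list
+prompts = clip.tokenize([f"a video frame of {a}" for a in action_names]).to(device)
+
+with torch.no_grad():
+    text_emb = model.encode_text(prompts)
+    text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
+
+# At inference, per-frame visual features are compared against text_emb
+# (e.g., cosine similarity) to score ongoing actions in the stream.
+            </code></pre>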
+
+ comment: Published in Integrated Computer-Aided Engineering +
+
+
+
+
+ + ☆ Propensity-driven Uncertainty Learning for Sample Exploration in + Source-Free Active Domain Adaptation + + +
+ Source-free active domain adaptation (SFADA) addresses the challenge of +adapting a pre-trained model to new domains without access to source data while +minimizing the need for target domain annotations. This scenario is +particularly relevant in real-world applications where data privacy, storage +limitations, or labeling costs are significant concerns. Key challenges in +SFADA include selecting the most informative samples from the target domain for +labeling, effectively leveraging both labeled and unlabeled target data, and +adapting the model without relying on source domain information. Additionally, +existing methods often struggle with noisy or outlier samples and may require +impractical progressive labeling during training. To effectively select more +informative samples without frequently requesting human annotations, we propose +the Propensity-driven Uncertainty Learning (ProULearn) framework. ProULearn +utilizes a novel homogeneity propensity estimation mechanism combined with +correlation index calculation to evaluate feature-level relationships. This +approach enables the identification of representative and challenging samples +while avoiding noisy outliers. Additionally, we develop a central correlation +loss to refine pseudo-labels and create compact class distributions during +adaptation. In this way, ProULearn effectively bridges the domain gap and +maximizes adaptation performance. The principles of informative sample +selection underlying ProULearn have broad implications beyond SFADA, offering +benefits across various deep learning tasks where identifying key data points +or features is crucial. Extensive experiments on four benchmark datasets +demonstrate that ProULearn outperforms state-of-the-art methods in domain +adaptation scenarios. + +
+
+
+
+
+ + ☆ Self-Supervised Diffusion MRI Denoising via Iterative and Stable + Refinement + + +
+ Magnetic Resonance Imaging (MRI), including diffusion MRI (dMRI), serves as a +``microscope'' for anatomical structures and routinely mitigates the influence +of low signal-to-noise ratio scans by compromising temporal or spatial +resolution. However, these compromises fail to meet clinical demands for both +efficiency and precision. Consequently, denoising is a vital preprocessing +step, particularly for dMRI, where clean data is unavailable. In this paper, we +introduce Di-Fusion, a fully self-supervised denoising method that leverages +the latter diffusion steps and an adaptive sampling process. Unlike previous +approaches, our single-stage framework achieves efficient and stable training +without extra noise model training and offers adaptive and controllable results +in the sampling process. Our thorough experiments on real and simulated data +demonstrate that Di-Fusion achieves state-of-the-art performance in +microstructure modeling, tractography tracking, and other downstream tasks. + +
+
+ comment: 39pages, 34figures +
+
+
+
+
+ + ☆ Quantized Spike-driven Transformer ICLR 2025 + + +
+ Spiking neural networks (SNNs) are emerging as a promising energy-efficient
+alternative to traditional artificial neural networks due to their spike-driven
+paradigm. However, recent research in the SNN domain has mainly focused on
+enhancing accuracy by designing large-scale Transformer structures, which
+typically rely on substantial computational resources, limiting their
+deployment on resource-constrained devices. To overcome this challenge, we
+propose a quantized spike-driven Transformer baseline (QSD-Transformer), which
+achieves reduced resource demands by utilizing low bit-width parameters.
+Regrettably, the QSD-Transformer often suffers from severe performance
+degradation. In this paper, we first conduct empirical analysis and find that
+the bimodal distribution of quantized spike-driven self-attention (Q-SDSA)
+leads to spike information distortion (SID) during quantization, causing
+significant performance degradation. To mitigate this issue, we take
+inspiration from mutual information entropy and propose a bi-level optimization
+strategy to rectify the information distribution in Q-SDSA. Specifically, at
+the lower level, we introduce an information-enhanced LIF to rectify the
+information distribution in Q-SDSA. At the upper level, we propose a
+fine-grained distillation scheme for the QSD-Transformer to align the
+distribution in Q-SDSA with that in the counterpart ANN. By integrating the
+bi-level optimization strategy, the QSD-Transformer can attain enhanced energy
+efficiency without sacrificing its high-performance advantage. For instance,
+when compared to the prior SNN benchmark on ImageNet, the QSD-Transformer
+achieves 80.3% top-1 accuracy, accompanied by significant reductions of 6.0×
+and 8.1× in power consumption and model size, respectively. Code is available
+at https://github.com/bollossom/QSD-Transformer.
+
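+
+ For intuition, a minimal sketch of the kind of low bit-width weight
+quantization such a baseline relies on (symmetric uniform quantization; the
+bit-width and tensor shapes are assumptions, not the paper's exact scheme):
+
+    import torch
+
+    def quantize_dequantize(w: torch.Tensor, bits: int = 4) -> torch.Tensor:
+        qmax = 2 ** (bits - 1) - 1                    # e.g. 7 for signed 4-bit
+        scale = w.abs().max().clamp(min=1e-8) / qmax
+        q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax)
+        return q * scale                              # dequantized for simulation
+
+    w = torch.randn(128, 128)
+    w_q = quantize_dequantize(w, bits=4)
+    print((w - w_q).abs().mean())                     # average quantization error
+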
+
+ comment: Accepted by ICLR 2025 +
+
+
+
+
+ + ☆ LDR-Net: A Novel Framework for AI-generated Image Detection via + Localized Discrepancy Representation + + +
+ With the rapid advancement of generative models, the visual quality of
+generated images has become nearly indistinguishable from that of real ones,
+posing challenges to content authenticity verification. Existing methods for
+detecting AI-generated images primarily focus on specific forgery clues, which
+are often tailored to particular generative models like GANs or diffusion
+models. These approaches struggle to generalize across architectures. Building
+on the observation that generative images often exhibit local anomalies, such
+as excessive smoothness, blurred textures, and unnatural pixel variations in
+small regions, we propose the localized discrepancy representation network
+(LDR-Net), a novel approach for detecting AI-generated images. LDR-Net captures
+smoothing artifacts and texture irregularities, which are common but often
+overlooked. It integrates two complementary modules: local gradient
+autocorrelation (LGA), which models local smoothing anomalies, and local
+variation pattern (LVP), which captures unnatural regularities by modeling the
+complexity of image patterns. By merging LGA and LVP features, a comprehensive
+representation of localized discrepancies can be provided. Extensive
+experiments demonstrate that our LDR-Net achieves state-of-the-art performance
+in detecting generated images and exhibits satisfactory generalization across
+unseen generative models. The code will be released upon acceptance of this
+paper.
+
+
+
+
+
+ + ☆ Leveraging Textual Anatomical Knowledge for Class-Imbalanced + Semi-Supervised Multi-Organ Segmentation + + +
+ Annotating 3D medical images demands substantial time and expertise, driving +the adoption of semi-supervised learning (SSL) for segmentation tasks. However, +the complex anatomical structures of organs often lead to significant class +imbalances, posing major challenges for deploying SSL in real-world scenarios. +Despite the availability of valuable prior information, such as inter-organ +relative positions and organ shape priors, existing SSL methods have yet to +fully leverage these insights. To address this gap, we propose a novel approach +that integrates textual anatomical knowledge (TAK) into the segmentation model. +Specifically, we use GPT-4o to generate textual descriptions of anatomical +priors, which are then encoded using a CLIP-based model. These encoded priors +are injected into the segmentation model as parameters of the segmentation +head. Additionally, contrastive learning is employed to enhance the alignment +between textual priors and visual features. Extensive experiments demonstrate +the superior performance of our method, significantly surpassing +state-of-the-art approaches. The source code will be available at: +https://github.com/Lunn88/TAK-Semi. + +
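+
+ A minimal sketch of the injection idea described above, assuming CLIP-style
+text embeddings act as class prototypes that parameterize a 1x1x1 segmentation
+head (shapes and names are assumptions, not the released implementation):
+
+    import torch
+    import torch.nn.functional as F
+
+    num_classes, text_dim = 5, 512
+    # Stand-in for encoded textual descriptions of each organ's anatomy.
+    text_priors = F.normalize(torch.randn(num_classes, text_dim), dim=-1)
+
+    feat = torch.randn(1, text_dim, 16, 16, 16)       # voxel features (assumed shape)
+    feat = F.normalize(feat, dim=1)
+
+    # The text priors serve as the weights of the segmentation head, so each
+    # class logit is a similarity between a voxel feature and its text prior.
+    logits = torch.einsum("bcdhw,kc->bkdhw", feat, text_priors)
+    print(logits.shape)                               # (1, num_classes, 16, 16, 16)
+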
+
+
+
+
+ + ☆ Streaming Video Understanding and Multi-round Interaction with + Memory-enhanced Knowledge ICLR 2025 + + +
+ Recent advances in Large Language Models (LLMs) have enabled the development
+of Video-LLMs, advancing multimodal learning by bridging video data with
+language tasks. However, current video understanding models struggle with
+processing long video sequences, supporting multi-turn dialogues, and adapting
+to real-world dynamic scenarios. To address these issues, we propose
+StreamChat, a training-free framework for streaming video reasoning and
+conversational interaction. StreamChat leverages a novel hierarchical memory
+system to efficiently process and compress video features over extended
+sequences, enabling real-time, multi-turn dialogue. Our framework incorporates
+a parallel system scheduling strategy that enhances processing speed and
+reduces latency, ensuring robust performance in real-world applications.
+Furthermore, we introduce StreamBench, a versatile benchmark that evaluates
+streaming video understanding across diverse media types and interactive
+scenarios, including multi-turn interactions and complex reasoning tasks.
+Extensive evaluations on StreamBench and other public benchmarks demonstrate
+that StreamChat significantly outperforms existing state-of-the-art models in
+terms of accuracy and response times, confirming its effectiveness for
+streaming video understanding. Code is available at StreamChat:
+https://github.com/hmxiong/StreamChat.
+
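+
+ A toy sketch of a hierarchical streaming memory in the spirit described above
+(capacities, pooling, and feature sizes are assumptions): recent frames are
+kept as-is while older frames are pooled into compact summary tokens.
+
+    import torch
+
+    class HierarchicalMemory:
+        def __init__(self, short_cap: int = 32, pool: int = 8):
+            self.short, self.long = [], []
+            self.short_cap, self.pool = short_cap, pool
+
+        def add(self, feat: torch.Tensor):            # feat: (dim,) per-frame feature
+            self.short.append(feat)
+            if len(self.short) > self.short_cap:      # compress the oldest chunk
+                chunk = torch.stack(self.short[: self.pool])
+                self.long.append(chunk.mean(0))
+                self.short = self.short[self.pool:]
+
+        def context(self) -> torch.Tensor:            # tokens handed to the LLM
+            return torch.stack(self.long + self.short)
+
+    mem = HierarchicalMemory()
+    for _ in range(100):
+        mem.add(torch.randn(256))
+    print(mem.context().shape)                        # far fewer tokens than frames
+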
+
+ comment: Accepted to ICLR 2025. Code is available at + https://github.com/hmxiong/StreamChat +
+
+
+
+
+ + ☆ Knowledge-Informed Multi-Agent Trajectory Prediction at Signalized + Intersections for Infrastructure-to-Everything + + +
+ Multi-agent trajectory prediction at signalized intersections is crucial for +developing efficient intelligent transportation systems and safe autonomous +driving systems. Due to the complexity of intersection scenarios and the +limitations of single-vehicle perception, the performance of vehicle-centric +prediction methods has reached a plateau. Furthermore, most works underutilize +critical intersection information, including traffic signals, and behavior +patterns induced by road structures. Therefore, we propose a multi-agent +trajectory prediction framework at signalized intersections dedicated to +Infrastructure-to-Everything (I2XTraj). Our framework leverages dynamic graph +attention to integrate knowledge from traffic signals and driving behaviors. A +continuous signal-informed mechanism is proposed to adaptively process +real-time traffic signals from infrastructure devices. Additionally, leveraging +the prior knowledge of the intersection topology, we propose a driving strategy +awareness mechanism to model the joint distribution of goal intentions and +maneuvers. To the best of our knowledge, I2XTraj represents the first +multi-agent trajectory prediction framework explicitly designed for +infrastructure deployment, supplying subscribable prediction services to all +vehicles at intersections. I2XTraj demonstrates state-of-the-art performance on +both the Vehicle-to-Infrastructure dataset V2X-Seq and the aerial-view dataset +SinD for signalized intersections. Quantitative evaluations show that our +approach outperforms existing methods by more than 30% in both multi-agent and +single-agent scenarios. + +
+
+
+
+
+ + ☆ EchoVideo: Identity-Preserving Human Video Generation by Multimodal + Feature Fusion + + +
+ Recent advancements in video generation have significantly impacted various
+downstream applications, particularly in identity-preserving video generation
+(IPT2V). However, existing methods struggle with "copy-paste" artifacts and low
+similarity issues, primarily due to their reliance on low-level facial image
+information. This dependence can result in rigid facial appearances and
+artifacts reflecting irrelevant details. To address these challenges, we
+propose EchoVideo, which employs two key strategies: (1) an Identity Image-Text
+Fusion Module (IITF) that integrates high-level semantic features from text,
+capturing clean facial identity representations while discarding occlusions,
+poses, and lighting variations to avoid the introduction of artifacts; (2) a
+two-stage training strategy, incorporating a stochastic method in the second
+phase to randomly utilize shallow facial information. The objective is to
+balance the enhancements in fidelity provided by shallow features while
+mitigating excessive reliance on them. This strategy encourages the model to
+utilize high-level features during training, ultimately fostering a more robust
+representation of facial identities. EchoVideo effectively preserves facial
+identities and maintains full-body integrity. Extensive experiments demonstrate
+that it achieves excellent results in generating high-quality videos with
+strong controllability and fidelity.
+
+
+
+
+
+ + ☆ MultiDreamer3D: Multi-concept 3D Customization with Concept-Aware + Diffusion Guidance + + +
+ While single-concept customization has been studied in 3D, multi-concept +customization remains largely unexplored. To address this, we propose +MultiDreamer3D that can generate coherent multi-concept 3D content in a +divide-and-conquer manner. First, we generate 3D bounding boxes using an +LLM-based layout controller. Next, a selective point cloud generator creates +coarse point clouds for each concept. These point clouds are placed in the 3D +bounding boxes and initialized into 3D Gaussian Splatting with concept labels, +enabling precise identification of concept attributions in 2D projections. +Finally, we refine 3D Gaussians via concept-aware interval score matching, +guided by concept-aware diffusion. Our experimental results show that +MultiDreamer3D not only ensures object presence and preserves the distinct +identities of each concept but also successfully handles complex cases such as +property change or interaction. To the best of our knowledge, we are the first +to address the multi-concept customization in 3D. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ One-cycle Structured Pruning with Stability Driven Structure Search + + +
+ Existing structured pruning typically involves multi-stage training
+procedures that often demand heavy computation. Pruning at initialization,
+which aims to address this limitation, reduces training costs but struggles
+with performance. To address these challenges, we propose an efficient
+framework for one-cycle structured pruning without compromising model
+performance. In this approach, we integrate pre-training, pruning, and
+fine-tuning into a single training cycle, referred to as the 'one-cycle
+approach'. The core idea is to search for the optimal sub-network during the
+early stages of network training, guided by norm-based group saliency criteria
+and structured sparsity regularization. We introduce a novel pruning indicator
+that determines the stable pruning epoch by assessing the similarity between
+evolving pruning sub-networks across consecutive training epochs. Group
+sparsity regularization further accelerates the pruning process and thus the
+entire training. Extensive experiments on datasets including CIFAR-10/100 and
+ImageNet, using VGGNet, ResNet, MobileNet, and ViT architectures, demonstrate
+that our method achieves state-of-the-art accuracy while being one of the most
+efficient pruning frameworks in terms of training time. The source code will be
+made publicly available.
+
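+
+ A small sketch of the stability idea described above, assuming an L2
+filter-norm saliency and an IoU threshold on consecutive-epoch masks (both are
+assumptions, not the paper's exact indicator):
+
+    import torch
+
+    def channel_mask(conv_weight: torch.Tensor, keep_ratio: float = 0.5) -> torch.Tensor:
+        scores = conv_weight.flatten(1).norm(p=2, dim=1)     # per-filter L2 norm
+        k = max(1, int(keep_ratio * scores.numel()))
+        keep = torch.zeros_like(scores, dtype=torch.bool)
+        keep[scores.topk(k).indices] = True
+        return keep
+
+    def mask_iou(m1: torch.Tensor, m2: torch.Tensor) -> float:
+        return (m1 & m2).sum().item() / max((m1 | m2).sum().item(), 1)
+
+    prev_mask = channel_mask(torch.randn(64, 32, 3, 3))      # epoch t-1
+    curr_mask = channel_mask(torch.randn(64, 32, 3, 3))      # epoch t
+    if mask_iou(prev_mask, curr_mask) > 0.95:                # assumed threshold
+        print("sub-network is stable: prune now, then fine-tune in the same cycle")
+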
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ GC-ConsFlow: Leveraging Optical Flow Residuals and Global Context for + Robust Deepfake Detection + + +
+ The rapid development of Deepfake technology has enabled the generation of
+highly realistic manipulated videos, posing severe social and ethical
+challenges. Existing Deepfake detection methods have primarily focused on
+either spatial or temporal inconsistencies, often neglecting the interplay
+between the two or suffering from interference caused by natural facial
+motions. To address these challenges, we propose the global context consistency
+flow (GC-ConsFlow), a novel dual-stream framework that effectively integrates
+spatial and temporal features for robust Deepfake detection. The global grouped
+context aggregation module (GGCA), integrated into the global context-aware
+frame flow stream (GCAF), enhances spatial feature extraction by aggregating
+grouped global context information, enabling the detection of subtle spatial
+artifacts within frames. The flow-gradient temporal consistency stream (FGTC)
+uses optical flow residuals and gradient-based features, rather than directly
+modeling the residuals, to improve the robustness of temporal feature
+extraction against inconsistencies introduced by unnatural facial motion. By
+combining these two streams, GC-ConsFlow demonstrates effectiveness and
+robustness in capturing complementary spatiotemporal forgery traces. Extensive
+experiments show that GC-ConsFlow outperforms existing state-of-the-art methods
+in detecting Deepfake videos under various compression scenarios.
+
+
+
+
+
+ + ☆ Emotion estimation from video footage with LSTM + + +
+ Emotion estimation is a long-studied field, and several machine learning
+approaches exist. In this paper, we present an LSTM model that processes the
+blend-shapes produced by the MediaPipe library for a face detected in a live
+camera stream to estimate the dominant emotion from facial expressions. The
+model is trained on the FER2013 dataset and achieves 71% accuracy and a 62%
+F1-score, which meets the accuracy benchmark of the FER2013 dataset with
+significantly reduced computation costs. Code:
+https://github.com/Samir-atra/Emotion_estimation_from_video_footage_with_LSTM_ML_algorithm
+
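+
+ A minimal sketch of the model family described above: an LSTM over per-frame
+blendshape vectors mapped to the seven FER2013 emotion classes (layer sizes,
+sequence length, and the 52-coefficient blendshape input are assumptions).
+
+    import torch
+    import torch.nn as nn
+
+    class BlendshapeLSTM(nn.Module):
+        def __init__(self, n_blendshapes: int = 52, hidden: int = 128, n_emotions: int = 7):
+            super().__init__()
+            self.lstm = nn.LSTM(n_blendshapes, hidden, batch_first=True)
+            self.head = nn.Linear(hidden, n_emotions)
+
+        def forward(self, x):                 # x: (batch, frames, n_blendshapes)
+            _, (h, _) = self.lstm(x)
+            return self.head(h[-1])           # emotion logits
+
+    model = BlendshapeLSTM()
+    clip = torch.rand(1, 30, 52)              # one short window of frames
+    print(model(clip).softmax(-1))
+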
+
+ comment: 11 pages, 6 figures, 32 references, 4 tables +
+
+
+
+
+ + ☆ Auto-Prompting SAM for Weakly Supervised Landslide Extraction + + +
+ Weakly supervised landslide extraction aims to identify landslide regions +from remote sensing data using models trained with weak labels, particularly +image-level labels. However, it is often challenged by the imprecise boundaries +of the extracted objects due to the lack of pixel-wise supervision and the +properties of landslide objects. To tackle these issues, we propose a simple +yet effective method by auto-prompting the Segment Anything Model (SAM), i.e., +APSAM. Instead of depending on high-quality class activation maps (CAMs) for +pseudo-labeling or fine-tuning SAM, our method directly yields fine-grained +segmentation masks from SAM inference through prompt engineering. Specifically, +it adaptively generates hybrid prompts from the CAMs obtained by an object +localization network. To provide sufficient information for SAM prompting, an +adaptive prompt generation (APG) algorithm is designed to fully leverage the +visual patterns of CAMs, enabling the efficient generation of pseudo-masks for +landslide extraction. These informative prompts are able to identify the extent +of landslide areas (box prompts) and denote the centers of landslide objects +(point prompts), guiding SAM in landslide segmentation. Experimental results on +high-resolution aerial and satellite datasets demonstrate the effectiveness of +our method, achieving improvements of at least 3.0\% in F1 score and 3.69\% in +IoU compared to other state-of-the-art methods. The source codes and datasets +will be available at https://github.com/zxk688. + +
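+
+ A minimal sketch of how a CAM can be turned into SAM-style prompts, in the
+spirit of the hybrid prompts described above (the threshold and the single
+box/point pair are assumptions, not the paper's APG algorithm):
+
+    import numpy as np
+
+    def cam_to_prompts(cam: np.ndarray, thr: float = 0.5):
+        cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
+        ys, xs = np.where(cam >= thr)
+        if len(xs) == 0:
+            return None, None
+        box = [int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())]  # x1, y1, x2, y2
+        point = [int(xs.mean()), int(ys.mean())]                            # rough object centre
+        return box, point
+
+    cam = np.random.rand(256, 256)            # stand-in class activation map
+    box_prompt, point_prompt = cam_to_prompts(cam)
+    print(box_prompt, point_prompt)           # fed to SAM's prompt encoder
+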
+
+ comment: 5 pages, 5 figures +
+
+
+
+
+ + ☆ Atmospheric Noise-Resilient Image Classification in a Real-World + Scenario: Using Hybrid CNN and Pin-GTSVM + + +
+ Parking space occupation detection using deep learning frameworks has seen +significant advancements over the past few years. While these approaches +effectively detect partial obstructions and adapt to varying lighting +conditions, their performance significantly diminishes when haze is present. +This paper proposes a novel hybrid model with a pre-trained feature extractor +and a Pinball Generalized Twin Support Vector Machine (Pin-GTSVM) classifier, +which removes the need for a dehazing system from the current State-of-The-Art +hazy parking slot classification systems and is also insensitive to any +atmospheric noise. The proposed system can seamlessly integrate with +conventional smart parking infrastructures, leveraging a minimal number of +cameras to monitor and manage hundreds of parking spaces efficiently. Its +effectiveness has been evaluated against established parking space detection +methods using the CNRPark Patches, PKLot, and a custom dataset specific to hazy +parking scenarios. Furthermore, empirical results indicate a significant +improvement in accuracy on a hazy parking system, thus emphasizing efficient +atmospheric noise handling. + +
+
+
+
+
+ + ☆ LVFace: Large Vision model for Face Recogniton + + +
+ Recently, large vision models have demonstrated powerful representation +capabilities in the field of computer vision. However, we unexpectedly found +that face recognition research is still mainly focused on CNN-based model +architectures, which may lead to suboptimal state-of-the-art (SOTA) performance +in face recognition. Therefore, we study how to use various loss functions from +historical research orthogonally to train a new state-of-the-art face +recognition model based on large vision models, called LVFace. On the largest +public face database, WebFace42M, we demonstrated the superiority of LVFace +over other advanced face recognition methods and achieved first place in the +ICCV21 MFR-Ongoing challenge, until the submission of this work (December 30, +2024, academic track). + +
+
+
+
+
+ + ☆ Rethinking the Sample Relations for Few-Shot Classification + + +
+ Feature quality is paramount for classification performance, particularly in +few-shot scenarios. Contrastive learning, a widely adopted technique for +enhancing feature quality, leverages sample relations to extract intrinsic +features that capture semantic information and has achieved remarkable success +in Few-Shot Learning (FSL). Nevertheless, current few-shot contrastive learning +approaches often overlook the semantic similarity discrepancies at different +granularities when employing the same modeling approach for different sample +relations, which limits the potential of few-shot contrastive learning. In this +paper, we introduce a straightforward yet effective contrastive learning +approach, Multi-Grained Relation Contrastive Learning (MGRCL), as a +pre-training feature learning model to boost few-shot learning by meticulously +modeling sample relations at different granularities. MGRCL categorizes sample +relations into three types: intra-sample relation of the same sample under +different transformations, intra-class relation of homogenous samples, and +inter-class relation of inhomogeneous samples. In MGRCL, we design +Transformation Consistency Learning (TCL) to ensure the rigorous semantic +consistency of a sample under different transformations by aligning predictions +of input pairs. Furthermore, to preserve discriminative information, we employ +Class Contrastive Learning (CCL) to ensure that a sample is always closer to +its homogenous samples than its inhomogeneous ones, as homogenous samples share +similar semantic content while inhomogeneous samples have different semantic +content. Our method is assessed across four popular FSL benchmarks, showing +that such a simple pre-training feature learning method surpasses a majority of +leading FSL methods. Moreover, our method can be incorporated into other FSL +methods as the pre-trained model and help them obtain significant performance +gains. + +
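+
+ A compact sketch of the two ingredients described above: a transformation
+consistency term that aligns predictions of two views of the same sample, and a
+class contrastive term that pulls same-class embeddings together (temperature
+and exact formulation are assumptions):
+
+    import torch
+    import torch.nn.functional as F
+
+    def consistency_loss(logits_a, logits_b):
+        # Align the prediction of one view with that of the other view.
+        return F.kl_div(F.log_softmax(logits_a, -1), F.softmax(logits_b, -1),
+                        reduction="batchmean")
+
+    def class_contrastive_loss(z, labels, tau: float = 0.1):
+        z = F.normalize(z, dim=-1)
+        sim = z @ z.T / tau
+        eye = torch.eye(len(z), dtype=torch.bool)
+        pos = (labels.unsqueeze(0) == labels.unsqueeze(1)) & ~eye
+        log_prob = sim - torch.logsumexp(sim.masked_fill(eye, -1e9), dim=1, keepdim=True)
+        return -log_prob[pos].mean()
+
+    z = torch.randn(8, 64)                    # embeddings of a mini-batch
+    labels = torch.randint(0, 3, (8,))
+    print(class_contrastive_loss(z, labels),
+          consistency_loss(torch.randn(8, 5), torch.randn(8, 5)))
+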
+
+ comment: 32 pages +
+
+
+
+
+ + ☆ GeomGS: LiDAR-Guided Geometry-Aware Gaussian Splatting for Robot + Localization + + +
+ Mapping and localization are crucial problems in robotics and autonomous +driving. Recent advances in 3D Gaussian Splatting (3DGS) have enabled precise +3D mapping and scene understanding by rendering photo-realistic images. +However, existing 3DGS methods often struggle to accurately reconstruct a 3D +map that reflects the actual scale and geometry of the real world, which +degrades localization performance. To address these limitations, we propose a +novel 3DGS method called Geometry-Aware Gaussian Splatting (GeomGS). This +method fully integrates LiDAR data into 3D Gaussian primitives via a +probabilistic approach, as opposed to approaches that only use LiDAR as initial +points or introduce simple constraints for Gaussian points. To this end, we +introduce a Geometric Confidence Score (GCS), which identifies the structural +reliability of each Gaussian point. The GCS is optimized simultaneously with +Gaussians under probabilistic distance constraints to construct a precise +structure. Furthermore, we propose a novel localization method that fully +utilizes both the geometric and photometric properties of GeomGS. Our GeomGS +demonstrates state-of-the-art geometric and localization performance across +several benchmarks, while also improving photometric performance. + +
+
+ comment: Preprint, Under review +
+
+
+
+
+ + ☆ VIGS SLAM: IMU-based Large-Scale 3D Gaussian Splatting SLAM + + +
+ Recently, map representations based on radiance fields such as 3D Gaussian
+Splatting and NeRF, which are excellent for realistic depiction, have attracted
+considerable attention, leading to attempts to combine them with SLAM. While
+these approaches can build highly realistic maps, large-scale SLAM still
+remains a challenge because they require a large number of Gaussian images for
+mapping and adjacent images as keyframes for tracking. We propose a novel 3D
+Gaussian Splatting SLAM method, VIGS SLAM, that utilizes sensor fusion of RGB-D
+and IMU sensors for large-scale indoor environments. To reduce the
+computational load of 3DGS-based tracking, we adopt an ICP-based tracking
+framework that combines IMU preintegration to provide a good initial guess for
+accurate pose estimation. Ours is the first method to show that Gaussian
+Splatting-based SLAM can be effectively performed in large-scale environments
+by integrating IMU sensor measurements. This not only enhances the performance
+of Gaussian Splatting SLAM beyond room-scale scenarios but also achieves SLAM
+performance comparable to state-of-the-art methods in large-scale indoor
+environments.
+
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ YOLOv8 to YOLO11: A Comprehensive Architecture In-depth Comparative + Review + + +
+ In the field of deep learning-based computer vision, YOLO is revolutionary. +With respect to deep learning models, YOLO is also the one that is evolving the +most rapidly. Unfortunately, not every YOLO model possesses scholarly +publications. Moreover, there exists a YOLO model that lacks a publicly +accessible official architectural diagram. Naturally, this engenders +challenges, such as complicating the understanding of how the model operates in +practice. Furthermore, the review articles that are presently available do not +delve into the specifics of each model. The objective of this study is to +present a comprehensive and in-depth architecture comparison of the four most +recent YOLO models, specifically YOLOv8 through YOLO11, thereby enabling +readers to quickly grasp not only how each model functions, but also the +distinctions between them. To analyze each YOLO version's architecture, we +meticulously examined the relevant academic papers, documentation, and +scrutinized the source code. The analysis reveals that while each version of +YOLO has improvements in architecture and feature extraction, certain blocks +remain unchanged. The lack of scholarly publications and official diagrams +presents challenges for understanding the model's functionality and future +enhancement. Future developers are encouraged to provide these resources. + +
+
+ comment: submitted to Journal of Applied Engineering and Technological Science +
+
+
+
+
+ + ☆ Towards Intelligent Design: A Self-driven Framework for Collocated + Clothing Synthesis Leveraging Fashion Styles and Textures + + +
+ Collocated clothing synthesis (CCS) has emerged as a pivotal topic in fashion +technology, primarily concerned with the generation of a clothing item that +harmoniously matches a given item. However, previous investigations have relied +on using paired outfits, such as a pair of matching upper and lower clothing, +to train a generative model for achieving this task. This reliance on the +expertise of fashion professionals in the construction of such paired outfits +has engendered a laborious and time-intensive process. In this paper, we +introduce a new self-driven framework, named style- and texture-guided +generative network (ST-Net), to synthesize collocated clothing without the +necessity for paired outfits, leveraging self-supervised learning. ST-Net is +designed to extrapolate fashion compatibility rules from the style and texture +attributes of clothing, using a generative adversarial network. To facilitate +the training and evaluation of our model, we have constructed a large-scale +dataset specifically tailored for unsupervised CCS. Extensive experiments +substantiate that our proposed method outperforms the state-of-the-art +baselines in terms of both visual authenticity and fashion compatibility. + +
+
+ comment: This paper has been accepted for presentation at ICASSP 2024 +
+
+
+
+
+ + ☆ AEON: Adaptive Estimation of Instance-Dependent In-Distribution and + Out-of-Distribution Label Noise for Robust Learning + + +
+ Robust training with noisy labels is a critical challenge in image +classification, offering the potential to reduce reliance on costly clean-label +datasets. Real-world datasets often contain a mix of in-distribution (ID) and +out-of-distribution (OOD) instance-dependent label noise, a challenge that is +rarely addressed simultaneously by existing methods and is further compounded +by the lack of comprehensive benchmarking datasets. Furthermore, even though +current noisy-label learning approaches attempt to find noisy-label samples +during training, these methods do not aim to estimate ID and OOD noise rates to +promote their effectiveness in the selection of such noisy-label samples, and +they are often represented by inefficient multi-stage learning algorithms. We +propose the Adaptive Estimation of Instance-Dependent In-Distribution and +Out-of-Distribution Label Noise (AEON) approach to address these research gaps. +AEON is an efficient one-stage noisy-label learning methodology that +dynamically estimates instance-dependent ID and OOD label noise rates to +enhance robustness to complex noise settings. Additionally, we introduce a new +benchmark reflecting real-world ID and OOD noise scenarios. Experiments +demonstrate that AEON achieves state-of-the-art performance on both synthetic +and real-world datasets + +
+
+ comment: In Submission +
+
+
+
+
+ + ☆ From Images to Point Clouds: An Efficient Solution for Cross-media Blind + Quality Assessment without Annotated Training + + +
+ We present a novel quality assessment method which can predict the perceptual +quality of point clouds from new scenes without available annotations by +leveraging the rich prior knowledge in images, called the Distribution-Weighted +Image-Transferred Point Cloud Quality Assessment (DWIT-PCQA). Recognizing the +human visual system (HVS) as the decision-maker in quality assessment +regardless of media types, we can emulate the evaluation criteria for human +perception via neural networks and further transfer the capability of quality +prediction from images to point clouds by leveraging the prior knowledge in the +images. Specifically, domain adaptation (DA) can be leveraged to bridge the +images and point clouds by aligning feature distributions of the two media in +the same feature space. However, the different manifestations of distortions in +images and point clouds make feature alignment a difficult task. To reduce the +alignment difficulty and consider the different distortion distribution during +alignment, we have derived formulas to decompose the optimization objective of +the conventional DA into two suboptimization functions with distortion as a +transition. Specifically, through network implementation, we propose the +distortion-guided biased feature alignment which integrates existing/estimated +distortion distribution into the adversarial DA framework, emphasizing common +distortion patterns during feature alignment. Besides, we propose the +quality-aware feature disentanglement to mitigate the destruction of the +mapping from features to quality during alignment with biased distortions. +Experimental results demonstrate that our proposed method exhibits reliable +performance compared to general blind PCQA methods without needing point cloud +annotations. + +
+
+
+
+
+ + ☆ Scalable Evaluation Framework for Foundation Models in Musculoskeletal + MRI Bridging Computational Innovation with Clinical Utility + + +
+ Foundation models hold transformative potential for medical imaging, but +their clinical utility requires rigorous evaluation to address their strengths +and limitations. This study introduces an evaluation framework for assessing +the clinical impact and translatability of SAM, MedSAM, and SAM2, using +musculoskeletal MRI as a case study. We tested these models across zero-shot +and finetuned paradigms to assess their ability to process diverse anatomical +structures and effectuate clinically reliable biomarkers, including cartilage +thickness, muscle volume, and disc height. We engineered a modular pipeline +emphasizing scalability, clinical relevance, and workflow integration, reducing +manual effort and aligning validation with end-user expectations. Hierarchical +modeling revealed how dataset mixing, anatomical complexity, and MRI +acquisition parameters influence performance, providing insights into the role +of imaging refinements in improving segmentation accuracy. This work +demonstrates how clinically focused evaluations can connect computational +advancements with tangible applications, creating a pathway for foundation +models to address medical challenges. By emphasizing interdisciplinary +collaboration and aligning technical innovation with clinical priorities, our +framework provides a roadmap for advancing machine learning technologies into +scalable and impactful biomedical solutions. + +
+
+
+
+
+ + ☆ Unraveling Normal Anatomy via Fluid-Driven Anomaly Randomization + + +
+ Data-driven machine learning has made significant strides in medical image +analysis. However, most existing methods are tailored to specific modalities +and assume a particular resolution (often isotropic). This limits their +generalizability in clinical settings, where variations in scan appearance +arise from differences in sequence parameters, resolution, and orientation. +Furthermore, most general-purpose models are designed for healthy subjects and +suffer from performance degradation when pathology is present. We introduce UNA +(Unraveling Normal Anatomy), the first modality-agnostic learning approach for +normal brain anatomy reconstruction that can handle both healthy scans and +cases with pathology. We propose a fluid-driven anomaly randomization method +that generates an unlimited number of realistic pathology profiles on-the-fly. +UNA is trained on a combination of synthetic and real data, and can be applied +directly to real images with potential pathology without the need for +fine-tuning. We demonstrate UNA's effectiveness in reconstructing healthy brain +anatomy and showcase its direct application to anomaly detection, using both +simulated and real images from 3D healthy and stroke datasets, including CT and +MRI scans. By bridging the gap between healthy and diseased images, UNA enables +the use of general-purpose models on diseased images, opening up new +opportunities for large-scale analysis of uncurated clinical images in the +presence of pathology. Code is available at https://github.com/peirong26/UNA. + +
+
+ comment: 16 pages, 6 figures +
+
+
+
+
+ + ☆ Meta-Feature Adapter: Integrating Environmental Metadata for Enhanced + Animal Re-identification + + +
+ Identifying individual animals within large wildlife populations is essential +for effective wildlife monitoring and conservation efforts. Recent advancements +in computer vision have shown promise in animal re-identification (Animal ReID) +by leveraging data from camera traps. However, existing methods rely +exclusively on visual data, neglecting environmental metadata that ecologists +have identified as highly correlated with animal behavior and identity, such as +temperature and circadian rhythms. To bridge this gap, we propose the +Meta-Feature Adapter (MFA), a lightweight module designed to integrate +environmental metadata into vision-language foundation models, such as CLIP, to +enhance Animal ReID performance. Our approach translates environmental metadata +into natural language descriptions, encodes them into metadata-aware text +embeddings, and incorporates these embeddings into image features through a +cross-attention mechanism. Furthermore, we introduce a Gated Cross-Attention +mechanism that dynamically adjusts the weights of metadata contributions, +further improving performance. To validate our approach, we constructed the +Metadata Augmented Animal Re-identification (MAAR) dataset, encompassing six +species from New Zealand and featuring paired image data and environmental +metadata. Extensive experiments demonstrate that MFA consistently improves +Animal ReID performance across multiple baseline models. + +
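+
+ A minimal sketch of a gated cross-attention layer in the spirit described
+above: image tokens attend to metadata-derived text embeddings and a learned
+gate controls how much metadata is injected (dimensions and gating form are
+assumptions):
+
+    import torch
+    import torch.nn as nn
+
+    class GatedMetaCrossAttention(nn.Module):
+        def __init__(self, dim: int = 256, heads: int = 4):
+            super().__init__()
+            self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+            self.gate = nn.Parameter(torch.zeros(1))   # starts closed, learned during training
+
+        def forward(self, img_tokens, meta_tokens):
+            attended, _ = self.attn(img_tokens, meta_tokens, meta_tokens)
+            return img_tokens + torch.tanh(self.gate) * attended
+
+    layer = GatedMetaCrossAttention()
+    img = torch.randn(2, 49, 256)     # image patch tokens
+    meta = torch.randn(2, 4, 256)     # encoded "temperature", "time of day", ... descriptions
+    print(layer(img, meta).shape)
+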
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Enhanced Extractor-Selector Framework and Symmetrization Weighted Binary + Cross-Entropy for Edge Detections + + +
+ Recent advancements have demonstrated the effectiveness of the
+extractor-selector (E-S) framework in edge detection (ED) tasks, which achieves
+state-of-the-art (SOTA) performance in both quantitative metrics and perceptual
+quality. However, this method still falls short of fully exploiting the
+potential of feature extractors, as selectors only operate on highly compressed
+feature maps that lack diversity and suffer from substantial information loss.
+Additionally, while union training can improve perceptual quality, the highest
+evaluation scores are typically obtained without it, creating a trade-off
+between quantitative accuracy and perceptual fidelity. To address these
+limitations, we propose an enhanced E-S architecture, which utilizes richer,
+less lossy feature representations and incorporates auxiliary features during
+the selection process, thereby improving the effectiveness of the feature
+selection mechanism. Additionally, we introduce a novel loss function, the
+Symmetrization Weighted Binary Cross-Entropy (SWBCE), which simultaneously
+emphasizes both the recall of edge pixels and the suppression of erroneous edge
+predictions, thereby enhancing both the perceptual quality and the accuracy of
+the predictions. The effectiveness and superiority of our approaches over
+baseline models, the standard E-S framework, and the standard Weighted Binary
+Cross-Entropy (WBCE) loss function are demonstrated by extensive experiments.
+For example, our enhanced E-S architecture trained with the SWBCE loss function
+achieves average improvements of 8.25%, 8.01%, and 33.25% in ODS, OIS, and AP,
+measured on BIPED2 compared with the baseline models, significantly
+outperforming the standard E-S method. The results set new benchmarks for ED
+tasks and highlight the potential of the methods beyond edge detection.
+
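+
+ For intuition only, a hedged sketch of a symmetrization-style weighted BCE:
+one class-balanced term emphasizing edge recall plus a mirrored term penalizing
+false edges. The paper's exact SWBCE formulation may differ; the weighting and
+the 0.5 mixing factor are assumptions.
+
+    import torch
+    import torch.nn.functional as F
+
+    def swbce(pred_logits, target, eps: float = 1e-6):
+        pos = target.sum()
+        neg = target.numel() - pos
+        w_pos = neg / (pos + neg + eps)       # rare edge pixels get a large weight
+        w_neg = pos / (pos + neg + eps)
+        w_recall = torch.where(target > 0.5, w_pos, w_neg)
+        recall_term = F.binary_cross_entropy_with_logits(pred_logits, target, weight=w_recall)
+        # Mirrored weights emphasize non-edge pixels, suppressing false edges.
+        w_prec = torch.where(target > 0.5, w_neg, w_pos)
+        precision_term = F.binary_cross_entropy_with_logits(pred_logits, target, weight=w_prec)
+        return 0.5 * (recall_term + precision_term)
+
+    pred = torch.randn(1, 1, 64, 64)
+    gt = (torch.rand(1, 1, 64, 64) > 0.9).float()
+    print(swbce(pred, gt))
+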
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ A light-weight model to generate NDWI from Sentinel-1 + + +
+ The use of Sentinel-2 images to compute Normalized Difference Water Index +(NDWI) has many applications, including water body area detection. However, +cloud cover poses significant challenges in this regard, which hampers the +effectiveness of Sentinel-2 images in this context. In this paper, we present a +deep learning model that can generate NDWI given Sentinel-1 images, thereby +overcoming this cloud barrier. We show the effectiveness of our model, where it +demonstrates a high accuracy of 0.9134 and an AUC of 0.8656 to predict the +NDWI. Additionally, we observe promising results with an R2 score of 0.4984 +(for regressing the NDWI values) and a Mean IoU of 0.4139 (for the underlying +segmentation task). In conclusion, our model offers a first and robust solution +for generating NDWI images directly from Sentinel-1 images and subsequent use +for various applications even under challenging conditions such as cloud cover +and nighttime. + +
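+
+ For reference, the regression target itself is the standard McFeeters NDWI
+computed from Sentinel-2's green (B3) and near-infrared (B8) bands; the model
+above learns to predict this quantity from Sentinel-1 input. The water
+threshold in the snippet is a common heuristic, not a value from the paper.
+
+    import numpy as np
+
+    def ndwi(green: np.ndarray, nir: np.ndarray, eps: float = 1e-8) -> np.ndarray:
+        return (green - nir) / (green + nir + eps)    # in [-1, 1]; water tends toward +1
+
+    green = np.random.rand(128, 128).astype(np.float32)   # stand-in B3 reflectance
+    nir = np.random.rand(128, 128).astype(np.float32)     # stand-in B8 reflectance
+    water_mask = ndwi(green, nir) > 0.0                   # assumed threshold
+    print(water_mask.mean())
+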
+
+
+
+
+ + ☆ NUDT4MSTAR: A New Dataset and Benchmark Towards SAR Target Recognition + in the Wild + + +
+ Synthetic Aperture Radar (SAR) stands as an indispensable sensor for Earth +observation, owing to its unique capability for all-day imaging. Nevertheless, +in a data-driven era, the scarcity of large-scale datasets poses a significant +bottleneck to advancing SAR automatic target recognition (ATR) technology. This +paper introduces NUDT4MSTAR, a large-scale SAR dataset for vehicle target +recognition in the wild, including 40 target types and a wide array of imaging +conditions across 5 different scenes. NUDT4MSTAR represents a significant leap +forward in dataset scale, containing over 190,000 images-tenfold the size of +its predecessors. To enhance the utility of this dataset, we meticulously +annotate each image with detailed target information and imaging conditions. We +also provide data in both processed magnitude images and original complex +formats. Then, we construct a comprehensive benchmark consisting of 7 +experiments with 15 recognition methods focusing on the stable and effective +ATR issues. Besides, we conduct transfer learning experiments utilizing various +models trained on NUDT4MSTAR and applied to three other target datasets, +thereby demonstrating its substantial potential to the broader field of ground +objects ATR. Finally, we discuss this dataset's application value and ATR's +significant challenges. To the best of our knowledge, this work marks the +first-ever endeavor to create a large-scale dataset benchmark for fine-grained +SAR recognition in the wild, featuring an extensive collection of exhaustively +annotated vehicle images. We expect that the open source of NUDT4MSTAR will +facilitate the development of SAR ATR and attract a wider community of +researchers. + +
+
+ comment: 18 pages, 15 figures; link: + https://github.com/waterdisappear/NUDT4MSTAR +
+
+
+
+
+ + ☆ Contrast: A Hybrid Architecture of Transformers and State Space Models + for Low-Level Vision + + +
+ Transformers have become increasingly popular for image super-resolution (SR)
+tasks due to their strong global context modeling capabilities. However, their
+quadratic computational complexity necessitates the use of window-based
+attention mechanisms, which restricts the receptive field and limits effective
+context expansion. Recently, the Mamba architecture has emerged as a promising
+alternative with linear computational complexity, allowing it to avoid window
+mechanisms and maintain a large receptive field. Nevertheless, Mamba faces
+challenges in handling long-context dependencies when high pixel-level
+precision is required, as in SR tasks. This is due to its hidden state
+mechanism, which can compress and store a substantial amount of context but
+only in an approximate manner, leading to inaccuracies that transformers do not
+suffer from. In this paper, we propose Contrast, a hybrid SR model that
+combines Convolutional, Transformer, and State Space components, effectively
+blending the strengths of transformers and Mamba to address their individual
+limitations. By integrating transformer and state space mechanisms, Contrast
+compensates for the shortcomings of each approach, enhancing both global
+context modeling and pixel-level accuracy. We demonstrate that combining these
+two architectures allows us to mitigate the problems inherent in each,
+resulting in improved performance on image super-resolution tasks.
+
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Polyhedra Encoding Transformers: Enhancing Diffusion MRI Analysis Beyond + Voxel and Volumetric Embedding + + +
+ Diffusion-weighted Magnetic Resonance Imaging (dMRI) is an essential tool in +neuroimaging. It is arguably the sole noninvasive technique for examining the +microstructural properties and structural connectivity of the brain. Recent +years have seen the emergence of machine learning and data-driven approaches +that enhance the speed, accuracy, and consistency of dMRI data analysis. +However, traditional deep learning models often fell short, as they typically +utilize pixel-level or volumetric patch-level embeddings similar to those used +in structural MRI, and do not account for the unique distribution of various +gradient encodings. In this paper, we propose a novel method called Polyhedra +Encoding Transformer (PE-Transformer) for dMRI, designed specifically to handle +spherical signals. Our approach involves projecting an icosahedral polygon onto +a unit sphere to resample signals from predetermined directions. These +resampled signals are then transformed into embeddings, which are processed by +a transformer encoder that incorporates orientational information reflective of +the icosahedral structure. Through experimental validation with various +gradient encoding protocols, our method demonstrates superior accuracy in +estimating multi-compartment models and Fiber Orientation Distributions (FOD), +outperforming both conventional CNN architectures and standard transformers. + +
+
+
+
+
+ + ☆ MSF: Efficient Diffusion Model Via Multi-Scale Latent Factorize + + +
+ Diffusion-based generative models have achieved remarkable progress in visual
+content generation. However, traditional diffusion models directly denoise the
+entire image from noisy inputs, disregarding the hierarchical structure present
+in visual signals. This method is computationally intensive, especially for
+high-resolution image generation. Signal processing often leverages
+hierarchical decompositions; for instance, Fourier analysis decomposes signals
+by frequency, while wavelet analysis captures localized frequency components,
+reflecting both spatial and frequency information simultaneously. Inspired by
+these principles, we propose a multiscale diffusion framework that generates
+hierarchical visual representations, which are subsequently integrated to form
+the final output. The diffusion model target, whether raw RGB pixels or latent
+features from a Variational Autoencoder, is divided into multiple components
+that each capture distinct spatial levels. The low-resolution component
+contains the primary informative signal, while higher-resolution components add
+high-frequency details, such as texture. This approach divides image generation
+into two stages: producing a low-resolution base signal, followed by a
+high-resolution residual signal. Both stages can be effectively modeled using
+simpler, lightweight transformer architectures compared to full-resolution
+generation. This decomposition is conceptually similar to wavelet decomposition
+but offers a more streamlined and intuitive design. Our method, termed
+MSF (short for Multi-Scale Factorization), achieves an FID of 2.2 and an IS of
+255.4 on the ImageNet 256x256 benchmark, reducing computational costs by 50%
+compared to baseline methods.
+
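+
+ A small sketch of the two-level factorization described above: a
+low-resolution base plus a high-frequency residual, each of which would then be
+modelled by its own lightweight diffusion stage (scale factor and pooling
+choice are assumptions):
+
+    import torch
+    import torch.nn.functional as F
+
+    def factorize(x: torch.Tensor, scale: int = 4):
+        base = F.avg_pool2d(x, kernel_size=scale)         # low-resolution base signal
+        up = F.interpolate(base, size=x.shape[-2:], mode="bilinear", align_corners=False)
+        residual = x - up                                 # high-frequency detail (texture)
+        return base, residual
+
+    x = torch.randn(1, 3, 256, 256)
+    base, residual = factorize(x)
+    recon = F.interpolate(base, size=x.shape[-2:], mode="bilinear", align_corners=False) + residual
+    print(base.shape, residual.shape, torch.allclose(recon, x))
+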
+
+
+
+
+ + ☆ YOLOSCM: An improved YOLO algorithm for cars detection + + +
+ Detecting objects in urban traffic images presents considerable difficulties +because of the following reasons: 1) These images are typically immense in +size, encompassing millions or even hundreds of millions of pixels, yet +computational resources are constrained. 2) The small size of vehicles in +certain scenarios leads to insufficient information for accurate detection. 3) +The uneven distribution of vehicles causes inefficient use of computational +resources. To address these issues, we propose YOLOSCM (You Only Look Once with +Segmentation Clustering Module), an efficient and effective framework. To +address the challenges of large-scale images and the non-uniform distribution +of vehicles, we propose a Segmentation Clustering Module (SCM). This module +adaptively identifies clustered regions, enabling the model to focus on these +areas for more precise detection. Additionally, we propose a new training +strategy to optimize the detection of small vehicles and densely packed targets +in complex urban traffic scenes. We perform extensive experiments on urban +traffic datasets to demonstrate the effectiveness and superiority of our +proposed approach. + +
+
+
+
+
+ + ☆ Multi-aspect Knowledge Distillation with Large Language Model + + +
+ Recent advancements in deep learning have significantly improved performance
+on computer vision tasks. Previous image classification methods primarily
+modify model architectures or add features, and they optimize models using
+cross-entropy loss on class logits. Since they focus on classifying images
+based solely on class labels, these methods may struggle to learn various
+aspects of classes (e.g., natural positions and shape changes). Rethinking the
+previous approach from a novel view, we propose a multi-aspect knowledge
+distillation method using Multimodal Large Language Models (MLLMs). Our
+approach involves: 1) querying an MLLM with multi-aspect questions relevant to
+the knowledge we want to transfer to the model, 2) extracting the corresponding
+logits from the MLLM, and 3) expanding the model's output dimensions to distill
+these multi-aspect logits. We then apply cross-entropy loss to class logits and
+binary cross-entropy loss to multi-aspect logits. Through our method, the model
+can learn not only the knowledge about visual aspects but also the abstract and
+complex aspects that require a deeper understanding. We primarily apply our
+method to image classification, and to explore the potential for extending our
+model, we expand it to other tasks, such as object detection. In all
+experimental results, our method improves the performance of the baselines.
+Additionally, we analyze the effect of multi-aspect knowledge distillation.
+These results demonstrate that our method can transfer knowledge about various
+aspects to the model and the aspect knowledge can enhance model performance in
+computer vision tasks. This paper demonstrates the great potential of
+multi-aspect knowledge distillation, and we believe it offers a promising
+direction for future research in computer vision and beyond.
+
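+
+ A minimal sketch of the combined objective described above: cross-entropy on
+the class logits plus binary cross-entropy on extra "aspect" logits produced by
+an expanded output head (the number of aspects and the soft targets standing in
+for MLLM-derived logits are assumptions):
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    num_classes, num_aspects, feat_dim = 10, 6, 512
+    head = nn.Linear(feat_dim, num_classes + num_aspects)  # expanded output dimensions
+
+    feats = torch.randn(4, feat_dim)
+    out = head(feats)
+    class_logits, aspect_logits = out[:, :num_classes], out[:, num_classes:]
+
+    labels = torch.randint(0, num_classes, (4,))
+    aspect_targets = torch.rand(4, num_aspects)            # stand-in for MLLM-derived soft labels
+
+    loss = F.cross_entropy(class_logits, labels) + \
+           F.binary_cross_entropy_with_logits(aspect_logits, aspect_targets)
+    print(loss)
+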
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Retrievals Can Be Detrimental: A Contrastive Backdoor Attack Paradigm on + Retrieval-Augmented Diffusion Models + + +
+ Diffusion models (DMs) have recently demonstrated remarkable generation +capability. However, their training generally requires huge computational +resources and large-scale datasets. To solve these, recent studies empower DMs +with the advanced Retrieval-Augmented Generation (RAG) technique and propose +retrieval-augmented diffusion models (RDMs). By incorporating rich knowledge +from an auxiliary database, RAG enhances diffusion models' generation and +generalization ability while significantly reducing model parameters. Despite +the great success, RAG may introduce novel security issues that warrant further +investigation. In this paper, we reveal that the RDM is susceptible to backdoor +attacks by proposing a multimodal contrastive attack approach named BadRDM. Our +framework fully considers RAG's characteristics and is devised to manipulate +the retrieved items for given text triggers, thereby further controlling the +generated contents. Specifically, we first insert a tiny portion of images into +the retrieval database as target toxicity surrogates. Subsequently, a malicious +variant of contrastive learning is adopted to inject backdoors into the +retriever, which builds shortcuts from triggers to the toxicity surrogates. +Furthermore, we enhance the attacks through novel entropy-based selection and +generative augmentation strategies that can derive better toxicity surrogates. +Extensive experiments on two mainstream tasks demonstrate the proposed BadRDM +achieves outstanding attack effects while preserving the model's benign +utility. + +
+
+
+
+
+ + ☆ CuriousBot: Interactive Mobile Exploration via Actionable 3D Relational + Object Graph + + +
+ Mobile exploration is a longstanding challenge in robotics, yet current +methods primarily focus on active perception instead of active interaction, +limiting the robot's ability to interact with and fully explore its +environment. Existing robotic exploration approaches via active interaction are +often restricted to tabletop scenes, neglecting the unique challenges posed by +mobile exploration, such as large exploration spaces, complex action spaces, +and diverse object relations. In this work, we introduce a 3D relational object +graph that encodes diverse object relations and enables exploration through +active interaction. We develop a system based on this representation and +evaluate it across diverse scenes. Our qualitative and quantitative results +demonstrate the system's effectiveness and generalization capabilities, +outperforming methods that rely solely on vision-language models (VLMs). + +
+
+ comment: Project Page: https://curiousbot.theaiinstitute.com/ +
+
+
+
+
+ + ☆ Gradient-Free Adversarial Purification with Diffusion Models + + +
+ Adversarial training and adversarial purification are two effective and +practical defense methods to enhance a model's robustness against adversarial +attacks. However, adversarial training necessitates additional training, while +adversarial purification suffers from low time efficiency. More critically, +current defenses are designed under the perturbation-based adversarial threat +model, which is ineffective against the recently proposed unrestricted +adversarial attacks. In this paper, we propose an effective and efficient +adversarial defense method that counters both perturbation-based and +unrestricted adversarial attacks. Our defense is inspired by the observation +that adversarial attacks are typically located near the decision boundary and +are sensitive to pixel changes. To address this, we introduce adversarial +anti-aliasing to mitigate adversarial modifications. Additionally, we propose +adversarial super-resolution, which leverages prior knowledge from clean +datasets to benignly recover images. These approaches do not require additional +training and are computationally efficient without calculating gradients. +Extensive experiments against both perturbation-based and unrestricted +adversarial attacks demonstrate that our defense method outperforms +state-of-the-art adversarial purification methods. + +
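+
+ A minimal sketch of the gradient-free "anti-aliasing" step described above:
+smooth and downsample-upsample the input so that pixel-level adversarial noise
+is washed out (the pooling-based filter and scale are assumptions, and the
+super-resolution recovery step is omitted):
+
+    import torch
+    import torch.nn.functional as F
+
+    def anti_alias(x: torch.Tensor, scale: int = 2) -> torch.Tensor:
+        # Average pooling acts as a crude low-pass filter before resampling.
+        low = F.avg_pool2d(x, kernel_size=scale)
+        return F.interpolate(low, size=x.shape[-2:], mode="bilinear", align_corners=False)
+
+    adv_image = torch.rand(1, 3, 224, 224)    # stand-in for an adversarial input
+    purified = anti_alias(adv_image)          # no gradients, no extra training
+    print(purified.shape)
+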
+
+
+
+
+ + ☆ Deblur-Avatar: Animatable Avatars from Motion-Blurred Monocular Videos + + +
+ We introduce Deblur-Avatar, a novel framework for modeling high-fidelity, +animatable 3D human avatars from motion-blurred monocular video inputs. Motion +blur is prevalent in real-world dynamic video capture, especially due to human +movements in 3D human avatar modeling. Existing methods either (1) assume sharp +image inputs, failing to address the detail loss introduced by motion blur, or +(2) mainly consider blur by camera movements, neglecting the human motion blur +which is more common in animatable avatars. Our proposed approach integrates a +human movement-based motion blur model into 3D Gaussian Splatting (3DGS). By +explicitly modeling human motion trajectories during exposure time, we jointly +optimize the trajectories and 3D Gaussians to reconstruct sharp, high-quality +human avatars. We employ a pose-dependent fusion mechanism to distinguish +moving body regions, optimizing both blurred and sharp areas effectively. +Extensive experiments on synthetic and real-world datasets demonstrate that +Deblur-Avatar significantly outperforms existing methods in rendering quality +and quantitative metrics, producing sharp avatar reconstructions and enabling +real-time rendering under challenging motion blur conditions. + +
+
+
+
+
+ + ☆ From Cross-Modal to Mixed-Modal Visible-Infrared Re-Identification + + +
+ Visible-infrared person re-identification (VI-ReID) aims to match individuals
+across different camera modalities, a critical task in modern surveillance
+systems. While current VI-ReID methods focus on cross-modality matching,
+real-world applications often involve mixed galleries containing both visible
+(V) and infrared (I) images, where state-of-the-art methods show significant
+performance limitations due to large domain shifts and low discrimination
+across mixed modalities. This is because gallery images from the same modality
+may have lower domain gaps but correspond to different identities. This paper
+introduces a novel mixed-modal ReID setting, where galleries contain data from
+both modalities. To address the inter-modal domain shift and the low
+discrimination capacity of intra-modal matching, we propose the Mixed
+Modality-Erased and -Related (MixER) method. The MixER learning approach
+disentangles modality-specific and modality-shared identity information through
+orthogonal decomposition, modality-confusion, and ID-modality-related
+objectives. MixER enhances feature robustness across modalities, improving
+performance in both cross-modal and mixed-modal settings. Our extensive
+experiments on the SYSU-MM01, RegDB and LLMC datasets indicate that our
+approach can provide state-of-the-art results using a single backbone, and
+showcase the flexibility of our approach in mixed gallery applications.
+
+
+
+
+
+ + ☆ Reinforcement Learning Platform for Adversarial Black-box Attacks with + Custom Distortion Filters + + +
+ We present a Reinforcement Learning Platform for Adversarial Black-box +untargeted and targeted attacks, RLAB, that allows users to select from various +distortion filters to create adversarial examples. The platform uses a +Reinforcement Learning agent to add minimum distortion to input images while +still causing misclassification by the target model. The agent uses a novel +dual-action method to explore the input image at each step to identify +sensitive regions for adding distortions while removing noises that have less +impact on the target model. This dual action leads to faster and more efficient +convergence of the attack. The platform can also be used to measure the +robustness of image classification models against specific distortion types. +Also, retraining the model with adversarial samples significantly improved +robustness when evaluated on benchmark datasets. The proposed platform +outperforms state-of-the-art methods in terms of the average number of queries +required to cause misclassification. This advances trustworthiness with a +positive social impact. + +
+
+ comment: Under Review for 2025 AAAI Conference on Artificial Intelligence + Proceedings +
+
+
+
+
+ + ☆ StreamingRAG: Real-time Contextual Retrieval and Generation Framework + + +
+ Extracting real-time insights from multi-modal data streams from various +domains such as healthcare, intelligent transportation, and satellite remote +sensing remains a challenge. High computational demands and limited knowledge +scope restrict the applicability of Multi-Modal Large Language Models (MM-LLMs) +on these data streams. Traditional Retrieval-Augmented Generation (RAG) systems +address knowledge limitations of these models, but suffer from slow +preprocessing, making them unsuitable for real-time analysis. We propose +StreamingRAG, a novel RAG framework designed for streaming data. StreamingRAG +constructs evolving knowledge graphs capturing scene-object-entity +relationships in real-time. The knowledge graph achieves temporal-aware scene +representations using MM-LLMs and enables timely responses for specific events +or user queries. StreamingRAG addresses limitations in existing methods, +achieving significant improvements in real-time analysis (5-6x faster +throughput), contextual accuracy (through a temporal knowledge graph), and +reduced resource consumption (using lightweight models by 2-3x). + +
+
+ comment: Accepted and Presented at AI4Sys, HPDC 2024 +
+
+
+
+
+ + ☆ Expanding on the BRIAR Dataset: A Comprehensive Whole Body Biometric + Recognition Resource at Extreme Distances and Real-World Scenarios + (Collections 1-4) CVPR + + +
+ The state-of-the-art in biometric recognition algorithms and operational +systems has advanced quickly in recent years providing high accuracy and +robustness in more challenging collection environments and consumer +applications. However, the technology still suffers greatly when applied to +non-conventional settings such as those seen when performing identification at +extreme distances or from elevated cameras on buildings or mounted to UAVs. +This paper summarizes an extension to the largest dataset currently focused on +addressing these operational challenges, and describes its composition as well +as methodologies of collection, curation, and annotation. + +
+
+ comment: 10 pages, 11 figures, 2 tables, submitted to CVPR +
+
+
+
+
+ + ☆ Efficient 2D CT Foundation Model for Contrast Phase Classification + + +
+ Purpose: The purpose of this study is to harness the efficiency of a 2D +foundation model to develop a robust phase classifier that is resilient to +domain shifts. + Materials and Methods: This retrospective study utilized three public +datasets from separate institutions. A 2D foundation model was trained on the +DeepLesion dataset (mean age: 51.2, s.d.: 17.6; 2398 males) to generate +embeddings from 2D CT slices for downstream contrast phase classification. The +classifier was trained on the VinDr Multiphase dataset and externally validated +on the WAW-TACE dataset. The 2D model was also compared to three 3D supervised +models. + Results: On the VinDr dataset (146 male, 63 female, 56 unidentified), the +model achieved near-perfect AUROC scores and F1 scores of 99.2%, 94.2%, and +93.1% for non-contrast, arterial, and venous phases, respectively. The `Other' +category scored lower (F1: 73.4%) due to combining multiple contrast phases +into one class. On the WAW-TACE dataset (mean age: 66.1, s.d.: 10.0; 185 +males), the model showed strong performance with AUROCs of 91.0% and 85.6%, and +F1 scores of 87.3% and 74.1% for non-contrast and arterial phases. Venous phase +performance was lower, with AUROC and F1 scores of 81.7% and 70.2% +respectively, due to label mismatches. Compared to 3D supervised models, the +approach trained faster, performed as well or better, and showed greater +robustness to domain shifts. + Conclusion: The robustness of the 2D Foundation model may be potentially +useful for automation of hanging protocols and data orchestration for clinical +deployment of AI algorithms. + +
+
+
+
+
+ + ☆ Prior Knowledge Injection into Deep Learning Models Predicting Gene + Expression from Whole Slide Images + + +
+ Cancer diagnosis and prognosis primarily depend on clinical parameters such
+as age and tumor grade, and are increasingly complemented by molecular data,
+such as gene expression, from tumor sequencing. However, sequencing is costly
+and delays oncology workflows. Recent advances in Deep Learning make it
+possible to predict molecular information from morphological features within
+Whole Slide Images (WSIs), offering a cost-effective proxy of the molecular
+markers. While promising, current methods lack the robustness to fully replace
+direct sequencing. Here we aim to improve existing methods by introducing a
+model-agnostic framework that allows prior knowledge on gene-gene interactions
+to be injected into Deep Learning architectures, thereby increasing accuracy
+and robustness. We design the framework to be generic and flexibly adaptable to
+a wide range of architectures. In a case study on breast cancer, our strategy
+leads to an average increase of 983 significant genes (out of 25,761) across
+all 18 experiments, with 14 generalizing to an increase on an independent
+dataset. Our findings reveal a high potential for prior knowledge injection to
+increase gene expression prediction performance from WSIs across a wide range
+of architectures.
+

+
+
+
+
+ + ☆ Revisiting CLIP: Efficient Alignment of 3D MRI and Tabular Data using + Domain-Specific Foundation Models + + +
+ Multi-modal models require aligned, shared embedding spaces. However, common +CLIP-based approaches need large amounts of samples and do not natively support +3D or tabular data, both of which are crucial in the medical domain. To address +these issues, we revisit CLIP-style alignment by training a domain-specific 3D +foundation model as an image encoder and demonstrate that modality alignment is +feasible with only 62 MRI scans. Our approach is enabled by a simple embedding +accumulation strategy required for training in 3D, which scales the amount of +negative pairs across batches in order to stabilize training. We perform a +thorough evaluation of various design choices, including the choice of backbone +and loss functions, and evaluate the proposed methodology on zero-shot +classification and image-retrieval tasks. While zero-shot image-retrieval +remains challenging, zero-shot classification results demonstrate that the +proposed approach can meaningfully align the representations of 3D MRI with +tabular data. + +
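+ The "embedding accumulation" strategy is only named in this abstract; a common
+way to scale the number of negative pairs across batches is a FIFO buffer of
+past embeddings, and the hedged PyTorch sketch below illustrates that
+assumption (class and function names are hypothetical, not this paper's code).
+```python
+import torch
+import torch.nn.functional as F
+
+class EmbeddingQueue:
+    """FIFO buffer that accumulates embeddings from past batches so the
+    contrastive loss sees more negatives than one small 3D batch provides."""
+    def __init__(self, dim: int, size: int = 1024):
+        self.buffer = torch.zeros(0, dim)
+        self.size = size
+
+    def push(self, emb: torch.Tensor) -> None:
+        self.buffer = torch.cat([self.buffer, emb.detach()], dim=0)[-self.size:]
+
+def clip_style_loss(img_emb, tab_emb, queue: EmbeddingQueue, tau: float = 0.07):
+    img_emb = F.normalize(img_emb, dim=1)
+    tab_emb = F.normalize(tab_emb, dim=1)
+    # Current tabular embeddings plus accumulated past ones act as candidates.
+    candidates = torch.cat([tab_emb, queue.buffer], dim=0)
+    logits = img_emb @ candidates.t() / tau
+    targets = torch.arange(img_emb.shape[0])   # matching pair is the i-th column
+    queue.push(tab_emb)
+    return F.cross_entropy(logits, targets)
+
+queue = EmbeddingQueue(dim=128)
+loss = clip_style_loss(torch.randn(4, 128), torch.randn(4, 128), queue)
+print(float(loss))
+```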
+
+ comment: 10 pages, 2 figures. To be published in ISBI 2025 +
+
+
+
+
+ + ☆ SIDDA: SInkhorn Dynamic Domain Adaptation for Image Classification with + Equivariant Neural Networks + + +
+ Modern neural networks (NNs) often do not generalize well in the presence of +a "covariate shift"; that is, in situations where the training and test data +distributions differ, but the conditional distribution of classification labels +remains unchanged. In such cases, NN generalization can be reduced to a problem +of learning more domain-invariant features. Domain adaptation (DA) methods +include a range of techniques aimed at achieving this; however, these methods +have struggled with the need for extensive hyperparameter tuning, which then +incurs significant computational costs. In this work, we introduce SIDDA, an +out-of-the-box DA training algorithm built upon the Sinkhorn divergence, that +can achieve effective domain alignment with minimal hyperparameter tuning and +computational overhead. We demonstrate the efficacy of our method on multiple +simulated and real datasets of varying complexity, including simple shapes, +handwritten digits, and real astronomical observations. SIDDA is compatible +with a variety of NN architectures, and it works particularly well in improving +classification accuracy and model calibration when paired with equivariant +neural networks (ENNs). We find that SIDDA enhances the generalization +capabilities of NNs, achieving up to a $\approx40\%$ improvement in +classification accuracy on unlabeled target data. We also study the efficacy of +DA on ENNs with respect to the varying group orders of the dihedral group +$D_N$, and find that the model performance improves as the degree of +equivariance increases. Finally, we find that SIDDA enhances model calibration +on both source and target data--achieving over an order of magnitude +improvement in the ECE and Brier score. SIDDA's versatility, combined with its +automated approach to domain alignment, has the potential to advance +multi-dataset studies by enabling the development of highly generalizable +models. + +
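+ SIDDA's precise objective is not given here; as a rough sketch under that
+caveat, the snippet below hand-rolls an entropic Sinkhorn divergence between
+source and target feature batches, the kind of alignment term such a method
+could minimize (the epsilon value and iteration count are illustrative).
+```python
+import math
+import torch
+
+def sinkhorn_cost(x, y, eps=0.1, n_iters=50):
+    """Entropic OT cost between two point clouds with uniform weights,
+    computed with log-domain Sinkhorn iterations."""
+    cost = torch.cdist(x, y, p=2) ** 2              # pairwise squared distances
+    n, m = cost.shape
+    log_a = torch.full((n,), -math.log(n))
+    log_b = torch.full((m,), -math.log(m))
+    f = torch.zeros(n)
+    g = torch.zeros(m)
+    for _ in range(n_iters):
+        f = -eps * torch.logsumexp((g + eps * log_b - cost) / eps, dim=1)
+        g = -eps * torch.logsumexp((f + eps * log_a - cost.t()) / eps, dim=1)
+    pi = torch.exp((f[:, None] + g[None, :] - cost) / eps
+                   + log_a[:, None] + log_b[None, :])
+    return (pi * cost).sum()
+
+def sinkhorn_divergence(x, y, eps=0.1):
+    """Debiased divergence: S(x, y) = OT(x, y) - (OT(x, x) + OT(y, y)) / 2."""
+    return sinkhorn_cost(x, y, eps) - 0.5 * (sinkhorn_cost(x, x, eps)
+                                             + sinkhorn_cost(y, y, eps))
+
+src = torch.randn(64, 32)          # source-domain features
+tgt = torch.randn(64, 32) + 0.5    # shifted target-domain features
+print(float(sinkhorn_divergence(src, tgt)))
+```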
+
+ comment: 25 pages, 5 figures, 4 tables. code available at: + https://github.com/deepskies/SIDDA +
+
+
+
+
+ + ☆ LLM-guided Instance-level Image Manipulation with Diffusion U-Net + Cross-Attention Maps + + +
+ The advancement of text-to-image synthesis has introduced powerful generative +models capable of creating realistic images from textual prompts. However, +precise control over image attributes remains challenging, especially at the +instance level. While existing methods offer some control through fine-tuning +or auxiliary information, they often face limitations in flexibility and +accuracy. To address these challenges, we propose a pipeline leveraging Large +Language Models (LLMs), open-vocabulary detectors, cross-attention maps and +intermediate activations of diffusion U-Net for instance-level image +manipulation. Our method detects objects mentioned in the prompt and present in +the generated image, enabling precise manipulation without extensive training +or input masks. By incorporating cross-attention maps, our approach ensures +coherence in manipulated images while controlling object positions. Our method +enables precise manipulations at the instance level without fine-tuning or +auxiliary information such as masks or bounding boxes. Code is available at +https://github.com/Palandr123/DiffusionU-NetLLM + +
+
+ comment: Presented at BMVC 2024 +
+
+
+
+
+ + ☆ Implicit Neural Surface Deformation with Explicit Velocity Fields ICLR 2025 + + +
+ In this work, we introduce the first unsupervised method that simultaneously +predicts time-varying neural implicit surfaces and deformations between pairs +of point clouds. We propose to model the point movement using an explicit +velocity field and directly deform a time-varying implicit field using the +modified level-set equation. This equation utilizes an iso-surface evolution +with Eikonal constraints in a compact formulation, ensuring the integrity of +the signed distance field. By applying a smooth, volume-preserving constraint +to the velocity field, our method successfully recovers physically plausible +intermediate shapes. Our method is able to handle both rigid and non-rigid +deformations without any intermediate shape supervision. Our experimental +results demonstrate that our method significantly outperforms existing works, +delivering superior results in both quality and efficiency. + +
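+ To make the ingredients above concrete, the following toy PyTorch sketch
+evaluates a level-set transport residual, an Eikonal constraint, and a
+divergence-free (volume-preserving) penalty for small MLPs standing in for the
+implicit field and velocity field; network sizes and loss weights are
+assumptions, not the paper's configuration.
+```python
+import torch
+import torch.nn as nn
+
+phi_net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))  # phi(x, t)
+vel_net = nn.Sequential(nn.Linear(3, 64), nn.Tanh(), nn.Linear(64, 3))  # v(x)
+
+x = torch.rand(256, 3, requires_grad=True)
+t = torch.rand(256, 1, requires_grad=True)
+
+phi = phi_net(torch.cat([x, t], dim=1))
+v = vel_net(x)
+
+grad_phi, = torch.autograd.grad(phi.sum(), x, create_graph=True)
+dphi_dt, = torch.autograd.grad(phi.sum(), t, create_graph=True)
+
+# Level-set transport residual: d(phi)/dt + v . grad(phi) = 0
+levelset_loss = (dphi_dt.squeeze(1) + (v * grad_phi).sum(dim=1)).pow(2).mean()
+# Eikonal constraint keeps phi a signed distance field: |grad(phi)| = 1
+eikonal_loss = (grad_phi.norm(dim=1) - 1.0).pow(2).mean()
+# Volume preservation via a divergence-free velocity field: div(v) = 0
+div = sum(torch.autograd.grad(v[:, i].sum(), x, create_graph=True)[0][:, i]
+          for i in range(3))
+volume_loss = div.pow(2).mean()
+
+loss = levelset_loss + 0.1 * eikonal_loss + 0.1 * volume_loss  # weights are placeholders
+loss.backward()
+print(float(loss))
+```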
+
+ comment: ICLR 2025, 10 pages +
+
+
+
+
+ + ☆ INDIGO+: A Unified INN-Guided Probabilistic Diffusion Algorithm for + Blind and Non-Blind Image Restoration + + +
+ Generative diffusion models are becoming one of the most popular priors in
+image restoration (IR) tasks due to their remarkable ability to generate
+realistic natural images. Despite achieving satisfactory results, IR methods
+based on diffusion models present several limitations. First, most non-blind
+approaches require an analytical expression of the degradation model to guide
+the sampling process. Secondly, most existing blind approaches rely on families
+of pre-defined degradation models for training their deep networks. The above
+issues limit the flexibility of these approaches and thus their ability to
+handle real-world degradation tasks. In this paper, we propose a novel
+INN-guided probabilistic diffusion algorithm for non-blind and blind image
+restoration, namely INDIGO and BlindINDIGO, which combines the merits of the
+perfect reconstruction property of invertible neural networks (INN) with the
+strong generative capabilities of pre-trained diffusion models. Specifically,
+we train the forward process of the INN to simulate an arbitrary degradation
+process and use the inverse to obtain an intermediate image that we use to
+guide the reverse diffusion sampling process through a gradient step. We also
+introduce an initialization strategy to further improve the performance and
+inference speed of our algorithm. Experiments demonstrate that our algorithm
+obtains competitive results compared with recent leading methods both
+quantitatively and visually on synthetic and real-world low-quality images.
+

+
+ comment: Accepted by IEEE Journal of Selected Topics in Signal Processing + (JSTSP) +
+
+
+
+
+ + ☆ Leveraging Multiphase CT for Quality Enhancement of Portal Venous CT: + Utility for Pancreas Segmentation + + +
+ Multiphase CT studies are routinely obtained in clinical practice for +diagnosis and management of various diseases, such as cancer. However, the CT +studies can be acquired with low radiation doses, different scanners, and are +frequently affected by motion and metal artifacts. Prior approaches have +targeted the quality improvement of one specific CT phase (e.g., non-contrast +CT). In this work, we hypothesized that leveraging multiple CT phases for the +quality enhancement of one phase may prove advantageous for downstream tasks, +such as segmentation. A 3D progressive fusion and non-local (PFNL) network was +developed. It was trained with three degraded (low-quality) phases +(non-contrast, arterial, and portal venous) to enhance the quality of the +portal venous phase. Then, the effect of scan quality enhancement was evaluated +using a proxy task of pancreas segmentation, which is useful for tracking +pancreatic cancer. The proposed approach improved the pancreas segmentation by +3% over the corresponding low-quality CT scan. To the best of our knowledge, we +are the first to harness multiphase CT for scan quality enhancement and +improved pancreas segmentation. + +
+
+ comment: ISBI 2025 +
+
+
+
+
+ + ♻ ☆ Truncated Consistency Models ICLR 2025 + + +
+ Consistency models have recently been introduced to accelerate sampling from +diffusion models by directly predicting the solution (i.e., data) of the +probability flow ODE (PF ODE) from initial noise. However, the training of +consistency models requires learning to map all intermediate points along PF +ODE trajectories to their corresponding endpoints. This task is much more +challenging than the ultimate objective of one-step generation, which only +concerns the PF ODE's noise-to-data mapping. We empirically find that this +training paradigm limits the one-step generation performance of consistency +models. To address this issue, we generalize consistency training to the +truncated time range, which allows the model to ignore denoising tasks at +earlier time steps and focus its capacity on generation. We propose a new +parameterization of the consistency function and a two-stage training procedure +that prevents the truncated-time training from collapsing to a trivial +solution. Experiments on CIFAR-10 and ImageNet $64\times64$ datasets show that +our method achieves better one-step and two-step FIDs than the state-of-the-art +consistency models such as iCT-deep, using more than 2$\times$ smaller +networks. Project page: https://truncated-cm.github.io/ + +
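+ A minimal, discrete-time sketch of the truncated-training idea (not the
+paper's parameterization or schedule) is shown below: consistency targets are
+only formed for times sampled from a truncated range [t_trunc, T], so capacity
+is not spent on early denoising steps. The threshold, step size, and toy
+networks are illustrative.
+```python
+import torch
+import torch.nn as nn
+
+T, t_trunc = 1.0, 0.3      # full horizon and truncation threshold (illustrative)
+model = nn.Sequential(nn.Linear(2 + 1, 64), nn.SiLU(), nn.Linear(64, 2))
+ema = nn.Sequential(nn.Linear(2 + 1, 64), nn.SiLU(), nn.Linear(64, 2))
+ema.load_state_dict(model.state_dict())
+
+def consistency_fn(net, x_t, t):
+    # Boundary conditions are usually enforced with skip scalings;
+    # here we simply concatenate t for brevity.
+    return net(torch.cat([x_t, t], dim=1))
+
+x0 = torch.randn(128, 2)                          # clean toy data samples
+# Sample times only from the truncated range [t_trunc, T], so the model
+# no longer spends capacity on denoising at early (small-t) steps.
+t = t_trunc + (T - t_trunc) * torch.rand(128, 1)
+dt = 0.01
+noise = torch.randn_like(x0)
+x_t = x0 + t * noise                              # noised sample at time t
+x_s = x0 + (t - dt) * noise                       # adjacent point on the same trajectory
+
+loss = (consistency_fn(model, x_t, t)
+        - consistency_fn(ema, x_s, t - dt).detach()).pow(2).mean()
+loss.backward()
+print(float(loss))
+```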
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Accelerate High-Quality Diffusion Models with Inner Loop Feedback + + +
+ We propose Inner Loop Feedback (ILF), a novel approach to accelerate
+diffusion models' inference. ILF trains a lightweight module to predict future
+features in the denoising process by leveraging the outputs from a chosen
+diffusion backbone block at a given time step. This approach exploits two key
+intuitions: (1) the outputs of a given block at adjacent time steps are
+similar, and (2) performing partial computations for a step imposes a lower
+burden on the model than skipping the step entirely. Our method is highly
+flexible, since we find that the feedback module itself can simply be a block
+from the diffusion backbone, with all settings copied. Its influence on the
+diffusion forward pass can be tempered with a learnable scaling factor
+initialized at zero. We train this module using distillation losses; however,
+unlike some prior work where a full diffusion backbone serves as the student,
+our model freezes the backbone, training only the feedback module. While many
+efforts to optimize diffusion models focus on achieving acceptable image
+quality in extremely few steps (1-4 steps), our emphasis is on matching
+best-case results (typically achieved in 20 steps) while significantly reducing
+runtime. ILF achieves this balance effectively, demonstrating strong
+performance for both class-to-image generation with diffusion transformers (DiT)
+and text-to-image generation with DiT-based PixArt-alpha and PixArt-sigma. The
+quality of ILF's 1.7x-1.8x speedups is confirmed by FID, CLIP score, CLIP
+Image Quality Assessment, ImageReward, and qualitative comparisons. Project
+information is available at https://mgwillia.github.io/ilf.
+

+
+ comment: submission currently under review; 20 pages, 17 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Enhanced Encoder-Decoder Architecture for Accurate Monocular Depth + Estimation + + +
+ Estimating depth from a single 2D image is a challenging task due to the lack +of stereo or multi-view data, which are typically required for depth +perception. In state-of-the-art architectures, the main challenge is to +efficiently capture complex objects and fine-grained details, which are often +difficult to predict. This paper introduces a novel deep learning-based +approach using an enhanced encoder-decoder architecture, where the +Inception-ResNet-v2 model serves as the encoder. This is the first instance of +utilizing Inception-ResNet-v2 as an encoder for monocular depth estimation, +demonstrating improved performance over previous models. It incorporates +multi-scale feature extraction to enhance depth prediction accuracy across +various object sizes and distances. We propose a composite loss function +comprising depth loss, gradient edge loss, and Structural Similarity Index +Measure (SSIM) loss, with fine-tuned weights to optimize the weighted sum, +ensuring a balance across different aspects of depth estimation. Experimental +results on the KITTI dataset show that our model achieves a significantly +faster inference time of 0.019 seconds, outperforming vision transformers in +efficiency while maintaining good accuracy. On the NYU Depth V2 dataset, the +model establishes state-of-the-art performance, with an Absolute Relative Error +(ARE) of 0.064, a Root Mean Square Error (RMSE) of 0.228, and an accuracy of +89.3% for $\delta$ < 1.25. These metrics demonstrate that our model can +accurately and efficiently predict depth even in challenging scenarios, +providing a practical solution for real-time applications. + +
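+ The composite loss described above can be sketched as follows; the SSIM term
+here uses a simplified 3x3 average-pooling window, and the loss weights are
+placeholders rather than the paper's fine-tuned values.
+```python
+import torch
+import torch.nn.functional as F
+
+def ssim(a, b, c1=0.01 ** 2, c2=0.03 ** 2):
+    """Simplified single-scale SSIM with a 3x3 average-pooling window."""
+    mu_a, mu_b = F.avg_pool2d(a, 3, 1, 1), F.avg_pool2d(b, 3, 1, 1)
+    var_a = F.avg_pool2d(a * a, 3, 1, 1) - mu_a ** 2
+    var_b = F.avg_pool2d(b * b, 3, 1, 1) - mu_b ** 2
+    cov = F.avg_pool2d(a * b, 3, 1, 1) - mu_a * mu_b
+    s = ((2 * mu_a * mu_b + c1) * (2 * cov + c2)) / (
+        (mu_a ** 2 + mu_b ** 2 + c1) * (var_a + var_b + c2))
+    return s.clamp(0, 1).mean()
+
+def composite_depth_loss(pred, gt, w_depth=1.0, w_grad=1.0, w_ssim=1.0):
+    depth_l1 = (pred - gt).abs().mean()
+    # Gradient (edge) loss: match horizontal and vertical depth differences.
+    dx = ((pred[..., :, 1:] - pred[..., :, :-1])
+          - (gt[..., :, 1:] - gt[..., :, :-1])).abs().mean()
+    dy = ((pred[..., 1:, :] - pred[..., :-1, :])
+          - (gt[..., 1:, :] - gt[..., :-1, :])).abs().mean()
+    ssim_loss = 1.0 - ssim(pred, gt)
+    return w_depth * depth_l1 + w_grad * (dx + dy) + w_ssim * ssim_loss
+
+pred = torch.rand(2, 1, 64, 64, requires_grad=True)
+gt = torch.rand(2, 1, 64, 64)
+composite_depth_loss(pred, gt).backward()
+```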
+
+
+
+
+ + ♻ ☆ DART: Denoising Autoregressive Transformer for Scalable Text-to-Image + Generation ICLR2025 + + +
+ Diffusion models have become the dominant approach for visual generation. +They are trained by denoising a Markovian process which gradually adds noise to +the input. We argue that the Markovian property limits the model's ability to +fully utilize the generation trajectory, leading to inefficiencies during +training and inference. In this paper, we propose DART, a transformer-based +model that unifies autoregressive (AR) and diffusion within a non-Markovian +framework. DART iteratively denoises image patches spatially and spectrally +using an AR model that has the same architecture as standard language models. +DART does not rely on image quantization, which enables more effective image +modeling while maintaining flexibility. Furthermore, DART seamlessly trains +with both text and image data in a unified model. Our approach demonstrates +competitive performance on class-conditioned and text-to-image generation +tasks, offering a scalable, efficient alternative to traditional diffusion +models. Through this unified framework, DART sets a new benchmark for scalable, +high-quality image synthesis. + +
+
+ comment: Accepted by ICLR2025 +
+
+
+
+
+ + ♻ ☆ MuMA-ToM: Multi-modal Multi-Agent Theory of Mind + + +
+ Understanding people's social interactions in complex real-world scenarios +often relies on intricate mental reasoning. To truly understand how and why +people interact with one another, we must infer the underlying mental states +that give rise to the social interactions, i.e., Theory of Mind reasoning in +multi-agent interactions. Additionally, social interactions are often +multi-modal -- we can watch people's actions, hear their conversations, and/or +read about their past behaviors. For AI systems to successfully and safely +interact with people in real-world environments, they also need to understand +people's mental states as well as their inferences about each other's mental +states based on multi-modal information about their interactions. For this, we +introduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark. +MuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates +mental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide +video and text descriptions of people's multi-modal behavior in realistic +household environments. Based on the context, we then ask questions about +people's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM +in a human experiment and provided a human baseline. We also proposed a novel +multi-modal, multi-agent ToM model, LIMP (Language model-based Inverse +Multi-agent Planning). Our experimental results show that LIMP significantly +outperforms state-of-the-art methods, including large multi-modal models (e.g., +GPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM. + +
+
+ comment: AAAI-25 (Oral). Project website: + https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code: + https://github.com/SCAI-JHU/MuMA-ToM +
+
+
+
+
+ + ♻ ☆ 3DGSR: Implicit Surface Reconstruction with 3D Gaussian Splatting + + +
+ In this paper, we present an implicit surface reconstruction method with 3D +Gaussian Splatting (3DGS), namely 3DGSR, that allows for accurate 3D +reconstruction with intricate details while inheriting the high efficiency and +rendering quality of 3DGS. The key insight is incorporating an implicit signed +distance field (SDF) within 3D Gaussians to enable them to be aligned and +jointly optimized. First, we introduce a differentiable SDF-to-opacity +transformation function that converts SDF values into corresponding Gaussians' +opacities. This function connects the SDF and 3D Gaussians, allowing for +unified optimization and enforcing surface constraints on the 3D Gaussians. +During learning, optimizing the 3D Gaussians provides supervisory signals for +SDF learning, enabling the reconstruction of intricate details. However, this +only provides sparse supervisory signals to the SDF at locations occupied by +Gaussians, which is insufficient for learning a continuous SDF. Then, to +address this limitation, we incorporate volumetric rendering and align the +rendered geometric attributes (depth, normal) with those derived from 3D +Gaussians. This consistency regularization introduces supervisory signals to +locations not covered by discrete 3D Gaussians, effectively eliminating +redundant surfaces outside the Gaussian sampling range. Our extensive +experimental results demonstrate that our 3DGSR method enables high-quality 3D +surface reconstruction while preserving the efficiency and rendering quality of +3DGS. Besides, our method competes favorably with leading surface +reconstruction techniques while offering a more efficient learning process and +much better rendering qualities. The code will be available at +https://github.com/CVMI-Lab/3DGSR. + +
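+ The exact SDF-to-opacity transformation is not reproduced in this abstract;
+one common differentiable choice, shown below purely for illustration, is a
+bell-shaped function of the signed distance so that points near the surface
+receive high opacity (the sharpness parameter beta is hypothetical).
+```python
+import torch
+
+def sdf_to_opacity(sdf: torch.Tensor, beta: float = 0.05) -> torch.Tensor:
+    """Differentiable map from signed distance to a Gaussian-shaped opacity.
+    Points on the surface (sdf = 0) get opacity ~1, points far from it ~0.
+    The sharpness beta is an illustrative hyperparameter."""
+    return torch.exp(-(sdf ** 2) / (2.0 * beta ** 2))
+
+sdf = torch.linspace(-0.2, 0.2, 9, requires_grad=True)
+opacity = sdf_to_opacity(sdf)
+opacity.sum().backward()   # gradients flow back to the SDF, enabling joint optimization
+print(opacity.detach())
+```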
+
+
+
+
+ + ♻ ☆ How to Efficiently Annotate Images for Best-Performing Deep Learning + Based Segmentation Models: An Empirical Study with Weak and Noisy Annotations + and Segment Anything Model + + +
+ Deep neural networks (DNNs) have demonstrated exceptional performance across +various image segmentation tasks. However, the process of preparing datasets +for training segmentation DNNs is both labor-intensive and costly, as it +typically requires pixel-level annotations for each object of interest. To +mitigate this challenge, alternative approaches such as using weak labels +(e.g., bounding boxes or scribbles) or less precise (noisy) annotations can be +employed. Noisy and weak labels are significantly quicker to generate, allowing +for more annotated images within the same time frame. However, the potential +decrease in annotation quality may adversely impact the segmentation +performance of the resulting model. In this study, we conducted a comprehensive +cost-effectiveness evaluation on six variants of annotation strategies (9~10 +sub-variants in total) across 4 datasets and conclude that the common practice +of precisely outlining objects of interest is virtually never the optimal +approach when annotation budget is limited. Both noisy and weak annotations +showed usage cases that yield similar performance to the perfectly annotated +counterpart, yet had significantly better cost-effectiveness. We hope our +findings will help researchers be aware of the different available options and +use their annotation budgets more efficiently, especially in cases where +accurately acquiring labels for target objects is particularly costly. Our code +will be made available on https://github.com/yzluka/AnnotationEfficiency2D. + +
+
+ comment: Supplemental information is in appendix +
+
+
+
+
+ + ♻ ☆ Invariance Principle Meets Vicinal Risk Minimization + + +
+ Deep learning models excel in computer vision tasks but often fail to
+generalize to out-of-distribution (OOD) domains. Invariant Risk Minimization
+(IRM) aims to address OOD generalization by learning domain-invariant features.
+However, IRM struggles with datasets exhibiting significant diversity shifts.
+While data augmentation methods like Mixup and Semantic Data Augmentation (SDA)
+enhance diversity, they risk over-augmentation and label instability. To
+address these challenges, we propose a domain-shared Semantic Data Augmentation
+(SDA) module, a novel implementation of Vicinal Risk Minimization (VRM)
+designed to enhance dataset diversity while maintaining label consistency. We
+further provide a Rademacher complexity analysis, establishing a tighter
+generalization error bound compared to baseline methods. Extensive evaluations
+on OOD benchmarks, including PACS, VLCS, OfficeHome, and TerraIncognita,
+demonstrate consistent performance improvements over state-of-the-art domain
+generalization methods.
+

+
+
+
+
+ + ♻ ☆ Is Large-Scale Pretraining the Secret to Good Domain Generalization? + + +
+ Multi-Source Domain Generalization (DG) is the task of training on multiple +source domains and achieving high classification performance on unseen target +domains. Recent methods combine robust features from web-scale pretrained +backbones with new features learned from source data, and this has dramatically +improved benchmark results. However, it remains unclear if DG finetuning +methods are becoming better over time, or if improved benchmark performance is +simply an artifact of stronger pre-training. Prior studies have shown that +perceptual similarity to pre-training data correlates with zero-shot +performance, but we find the effect limited in the DG setting. Instead, we +posit that having perceptually similar data in pretraining is not enough; and +that it is how well these data were learned that determines performance. This +leads us to introduce the Alignment Hypothesis, which states that the final DG +performance will be high if and only if alignment of image and class label text +embeddings is high. Our experiments confirm the Alignment Hypothesis is true, +and we use it as an analysis tool of existing DG methods evaluated on DomainBed +datasets by splitting evaluation data into In-pretraining (IP) and +Out-of-pretraining (OOP). We show that all evaluated DG methods struggle on +DomainBed-OOP, while recent methods excel on DomainBed-IP. Put together, our +findings highlight the need for DG methods which can generalize beyond +pretraining alignment. + +
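+ One simple way to operationalize the alignment the hypothesis refers to is
+the cosine similarity between an image embedding and the text embedding of its
+ground-truth class; the sketch below uses random placeholder embeddings instead
+of a real vision-language encoder, and an IP/OOP split would be obtained by
+thresholding the per-sample scores.
+```python
+import torch
+import torch.nn.functional as F
+
+def alignment_score(image_emb: torch.Tensor, text_emb: torch.Tensor,
+                    labels: torch.Tensor) -> torch.Tensor:
+    """Cosine similarity between each image embedding and the text embedding
+    of its ground-truth class (one way to operationalize image-text alignment)."""
+    image_emb = F.normalize(image_emb, dim=1)
+    text_emb = F.normalize(text_emb, dim=1)
+    return (image_emb * text_emb[labels]).sum(dim=1)
+
+# Placeholder embeddings stand in for a pretrained vision-language encoder.
+img = torch.randn(100, 512)
+txt = torch.randn(10, 512)            # one text embedding per class name
+lbl = torch.randint(0, 10, (100,))
+scores = alignment_score(img, txt, lbl)
+print(scores.mean().item(), (scores > 0.0).float().mean().item())
+```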
+
+
+
+
+ + ♻ ☆ ClawMachine: Learning to Fetch Visual Tokens for Referential + Comprehension ICLR 2025 + + +
+ Aligning vision and language concepts at a finer level remains an essential
+topic of multimodal large language models (MLLMs), particularly for tasks such
+as referring and grounding. Existing methods, such as proxy encoding and
+geometry encoding, incorporate additional syntax to encode spatial information,
+imposing extra burdens when communicating between language and vision modules.
+In this study, we propose ClawMachine, offering a new methodology that
+explicitly notates each entity using token collectives, i.e., groups of visual
+tokens that collaboratively represent higher-level semantics. A hybrid
+perception mechanism is also explored to perceive and understand scenes from
+both discrete and continuous spaces. Our method unifies the prompt and answer
+of visual referential tasks without using additional syntax. By leveraging a
+joint vision-language vocabulary, ClawMachine further integrates referring and
+grounding in an auto-regressive manner, demonstrating great potential with
+scaled-up pre-training data. Experiments show that ClawMachine achieves
+superior performance on scene-level and referential understanding tasks with
+higher efficiency. It also exhibits the potential to integrate multi-source
+information for complex visual reasoning, which is beyond the capability of
+many MLLMs. Our code is available at github.com/martian422/ClawMachine.
+

+
+ comment: ICLR 2025. Code is available at github.com/martian422/ClawMachine +
+
+
+
+
+ + ♻ ☆ Aligning Human Motion Generation with Human Perceptions + + +
+ Human motion generation is a critical task with a wide range of applications. +Achieving high realism in generated motions requires naturalness, smoothness, +and plausibility. Despite rapid advancements in the field, current generation +methods often fall short of these goals. Furthermore, existing evaluation +metrics typically rely on ground-truth-based errors, simple heuristics, or +distribution distances, which do not align well with human perceptions of +motion quality. In this work, we propose a data-driven approach to bridge this +gap by introducing a large-scale human perceptual evaluation dataset, +MotionPercept, and a human motion critic model, MotionCritic, that capture +human perceptual preferences. Our critic model offers a more accurate metric +for assessing motion quality and could be readily integrated into the motion +generation pipeline to enhance generation quality. Extensive experiments +demonstrate the effectiveness of our approach in both evaluating and improving +the quality of generated human motions by aligning with human perceptions. Code +and data are publicly available at https://motioncritic.github.io/. + +
+
+ comment: Project page: https://motioncritic.github.io/ +
+
+
+
+
+ + ♻ ☆ VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video + Understanding + + +
+ In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation
+model for image and video understanding. The core design philosophy of
+VideoLLaMA3 is vision-centric. The meaning of "vision-centric" is two-fold: the
+vision-centric training paradigm and the vision-centric framework design. The
+key insight of our vision-centric training paradigm is that high-quality
+image-text data is crucial for both image and video understanding. Instead of
+preparing massive video-text datasets, we focus on constructing large-scale and
+high-quality image-text datasets. VideoLLaMA3 has four training stages: 1)
+Vision Encoder Adaptation, which enables the vision encoder to accept images of
+variable resolutions as input; 2) Vision-Language Alignment, which jointly
+tunes the vision encoder, projector, and LLM with large-scale image-text data
+covering multiple types (including scene images, documents, and charts) as well
+as text-only data; 3) Multi-task Fine-tuning, which incorporates image-text SFT
+data for downstream tasks and video-text data to establish a foundation for
+video understanding; and 4) Video-centric Fine-tuning, which further improves
+the model's capability in video understanding. As for the framework design, to
+better capture fine-grained details in images, the pretrained vision encoder is
+adapted to encode images of varying sizes into corresponding numbers of vision
+tokens, rather than a fixed number of tokens. For video inputs, we reduce the
+number of vision tokens according to their similarity so that the
+representation of videos will be more precise and compact. Benefiting from
+these vision-centric designs, VideoLLaMA3 achieves compelling performance on
+both image and video understanding benchmarks.
+

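+ The similarity-based token reduction for video inputs is described only
+qualitatively; one simple realization, sketched below under that assumption,
+drops a vision token whenever it is nearly identical to the previously kept
+token (the threshold is a hypothetical hyperparameter).
+```python
+import torch
+import torch.nn.functional as F
+
+def reduce_video_tokens(tokens: torch.Tensor, threshold: float = 0.9) -> torch.Tensor:
+    """Keep a vision token only if its cosine similarity to the last kept token
+    falls below `threshold`, collapsing near-duplicate frames (illustrative rule)."""
+    kept = [tokens[0]]
+    for tok in tokens[1:]:
+        sim = F.cosine_similarity(tok, kept[-1], dim=0)
+        if sim < threshold:
+            kept.append(tok)
+    return torch.stack(kept)
+
+tokens = torch.randn(16, 256)      # e.g. one pooled token per frame
+tokens[1] = tokens[0] + 1e-3       # near-duplicate frame that should be merged away
+print(reduce_video_tokens(tokens).shape)
+```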
+
+ comment: BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to + this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3 +
+
+
+
+
+ + ♻ ☆ Learning Hemodynamic Scalar Fields on Coronary Artery Meshes: A + Benchmark of Geometric Deep Learning Models + + +
+ Coronary artery disease, caused by the narrowing of coronary vessels due to +atherosclerosis, is the leading cause of death worldwide. The diagnostic gold +standard, fractional flow reserve (FFR), measures the trans-stenotic pressure +ratio during maximal vasodilation but is invasive and costly. This has driven +the development of virtual FFR (vFFR) using computational fluid dynamics (CFD) +to simulate coronary flow. Geometric deep learning algorithms have shown +promise for learning features on meshes, including cardiovascular research +applications. This study empirically analyzes various backends for predicting +vFFR fields in coronary arteries as CFD surrogates, comparing six backends for +learning hemodynamics on meshes using CFD solutions as ground truth. + The study has two parts: i) Using 1,500 synthetic left coronary artery +bifurcations, models were trained to predict pressure-related fields for vFFR +reconstruction, comparing different learning variables. ii) Using 427 +patient-specific CFD simulations, experiments were repeated focusing on the +best-performing learning variable from the synthetic dataset. + Most backends performed well on the synthetic dataset, especially when +predicting pressure drop over the manifold. Transformer-based backends +outperformed others when predicting pressure and vFFR fields and were the only +models achieving strong performance on patient-specific data, excelling in both +average per-point error and vFFR accuracy in stenotic lesions. + These results suggest geometric deep learning backends can effectively +replace CFD for simple geometries, while transformer-based networks are +superior for complex, heterogeneous datasets. Pressure drop was identified as +the optimal network output for learning pressure-related fields. + +
+
+
+
+
+ + ♻ ☆ A Simple Aerial Detection Baseline of Multimodal Language Models + + +
+ The multimodal language models (MLMs) based on generative pre-trained
+Transformers are considered powerful candidates for unifying various domains
+and tasks. MLMs developed for remote sensing (RS) have demonstrated outstanding
+performance in multiple tasks, such as visual question answering and visual
+grounding. In addition to visual grounding, which detects specific objects
+corresponding to a given instruction, aerial detection, which detects all
+objects of multiple categories, is also a valuable and challenging task for RS
+foundation models. However, aerial detection has not been explored by existing
+RS MLMs because the autoregressive prediction mechanism of MLMs differs
+significantly from the detection outputs. In this paper, we present a simple
+baseline for applying MLMs to aerial detection for the first time, named
+LMMRotate. Specifically, we first introduce a normalization method to transform
+detection outputs into textual outputs to be compatible with the MLM framework.
+Then, we propose an evaluation method, which ensures a fair comparison between
+MLMs and conventional object detection models. We construct the baseline by
+fine-tuning open-source general-purpose MLMs and achieve impressive detection
+performance comparable to conventional detectors. We hope that this baseline
+will serve as a reference for future MLM development, enabling more
+comprehensive capabilities for understanding RS images. Code is available at
+https://github.com/Li-Qingyun/mllm-mmrotate.
+

+
+ comment: 4 pages, 1 table, 4 figures +
+
+
+
+
+ + ♻ ☆ HFGCN:Hypergraph Fusion Graph Convolutional Networks for Skeleton-Based + Action Recognition + + +
+ In recent years, action recognition has received much attention and wide
+application due to its important role in video understanding. Most research on
+action recognition has focused on improving performance via various deep
+learning methods rather than on the classification of skeleton points. The
+topological modeling between skeleton points and body parts has seldom been
+considered. Although some studies have used a data-driven approach to classify
+the topology of the skeleton points, the kinematic nature of the skeleton
+points has not been taken into consideration. Therefore, in this paper, we draw
+on the theory of kinematics to adapt the topological relations of the skeleton
+points and propose a topological relation classification based on body parts
+and distance from the core of the body. To synthesize these topological
+relations for action recognition, we propose a novel Hypergraph Fusion Graph
+Convolutional Network (HFGCN). In particular, the proposed model is able to
+focus on the human skeleton points and the different body parts simultaneously,
+and thus construct the topology, which noticeably improves recognition
+accuracy. We use a hypergraph to represent the categorical relationships of
+these skeleton points and incorporate the hypergraph into a graph convolution
+network to model the higher-order relationships among the skeleton points and
+enhance the feature representation of the network. In addition, our proposed
+hypergraph attention module and hypergraph graph convolution module optimize
+topology modeling in the temporal and channel dimensions, respectively, to
+further enhance the feature representation of the network. We conducted
+extensive experiments on three widely used datasets. The results validate that
+our proposed method achieves the best performance when compared with the
+state-of-the-art skeleton-based methods.
+

+
+
+
+
+ + ♻ ☆ SaRPFF: A Self-Attention with Register-based Pyramid Feature Fusion + module for enhanced RLD detection + + +
+ Detecting objects across varying scales is still a challenge in computer
+vision, particularly in agricultural applications like Rice Leaf Disease (RLD)
+detection, where objects exhibit significant scale variations (SV).
+Conventional object detection (OD) methods like Faster R-CNN, SSD, and YOLO
+often fail to effectively address SV, leading to reduced accuracy and missed
+detections. To tackle this, we propose SaRPFF (Self-Attention with
+Register-based Pyramid Feature Fusion), a novel module designed to enhance
+multi-scale object detection. SaRPFF integrates 2D Multi-Head Self-Attention
+(MHSA) with register tokens, improving feature interpretability by mitigating
+artifacts within MHSA. Additionally, it integrates efficient attention atrous
+convolutions into the pyramid feature fusion and introduces a deconvolutional
+layer for refined up-sampling. We evaluate SaRPFF on YOLOv7 using the MRLD and
+COCO datasets. Our approach demonstrates a +2.61% improvement in Average
+Precision (AP) on the MRLD dataset compared to the baseline FPN method in
+YOLOv7. Furthermore, SaRPFF outperforms other FPN variants, including BiFPN,
+NAS-FPN, and PANET, showcasing its versatility and potential to advance OD
+techniques. This study highlights SaRPFF's effectiveness in addressing SV
+challenges and its adaptability across FPN-based OD models.
+

+
+
+
+
+ + ♻ ☆ Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid + Prototyping in Virtual Reality Applications + + +
+ SLAM is a foundational technique with broad applications in robotics and +AR/VR. SLAM simulations evaluate new concepts, but testing on +resource-constrained devices, such as VR HMDs, faces challenges: high +computational cost and restricted sensor data access. This work proposes a +sparse framework using mesh geometry projections as features, which improves +efficiency and circumvents direct sensor data access, advancing SLAM research +as we demonstrate in VR and through numerical evaluation. + +
+
+ comment: Accepted to ENPT XR at IEEE VR 2025 +
+
+
+
+
+ + ♻ ☆ OmniHD-Scenes: A Next-Generation Multimodal Dataset for Autonomous + Driving + + +
+ The rapid advancement of deep learning has intensified the need for +comprehensive data for use by autonomous driving algorithms. High-quality +datasets are crucial for the development of effective data-driven autonomous +driving solutions. Next-generation autonomous driving datasets must be +multimodal, incorporating data from advanced sensors that feature extensive +data coverage, detailed annotations, and diverse scene representation. To +address this need, we present OmniHD-Scenes, a large-scale multimodal dataset +that provides comprehensive omnidirectional high-definition data. The +OmniHD-Scenes dataset combines data from 128-beam LiDAR, six cameras, and six +4D imaging radar systems to achieve full environmental perception. The dataset +comprises 1501 clips, each approximately 30-s long, totaling more than 450K +synchronized frames and more than 5.85 million synchronized sensor data points. +We also propose a novel 4D annotation pipeline. To date, we have annotated 200 +clips with more than 514K precise 3D bounding boxes. These clips also include +semantic segmentation annotations for static scene elements. Additionally, we +introduce a novel automated pipeline for generation of the dense occupancy +ground truth, which effectively leverages information from non-key frames. +Alongside the proposed dataset, we establish comprehensive evaluation metrics, +baseline models, and benchmarks for 3D detection and semantic occupancy +prediction. These benchmarks utilize surround-view cameras and 4D imaging radar +to explore cost-effective sensor solutions for autonomous driving applications. +Extensive experiments demonstrate the effectiveness of our low-cost sensor +configuration and its robustness under adverse conditions. Data will be +released at https://www.2077ai.com/OmniHD-Scenes. + +
+
+
+
+
+ + ♻ ☆ CHaRNet: Conditioned Heatmap Regression for Robust Dental Landmark + Localization + + +
+ Identifying anatomical landmarks in 3D dental models is crucial for +orthodontic treatment. Manually placing these key points is complex, +time-consuming, and requires expert knowledge. While some machine learning +methods have been proposed for automatic tooth landmark detection in 3D +Intraoral Scans (IOS), research remains limited, with no fully end-to-end +approaches that avoid teeth segmentation. We propose CHaRNet (Conditioned +Heatmap Regression Network), the first end-to-end deep learning method for +tooth landmark detection in 3D IOS. Unlike traditional two-stage methods that +segment teeth before detecting landmarks, CHaRNet directly detects landmarks on +the input point cloud. It consists of four key modules: (1) a point cloud +encoder, (2) a point cloud decoder with a heatmap regression head, (3) a teeth +presence classification head, and (4) the innovative Conditioned Heatmap +Regression (CHaR) module. The CHaR module refines landmark regression by +leveraging teeth presence classification, enabling dynamic adaptation to cases +with missing teeth and improving accuracy in complex dental models. We evaluate +CHaRNet using five point cloud learning algorithms to validate the +effectiveness of the CHaR module and test it on a clinical dataset of 1,214 +annotated 3D dental models. Both the dataset and code will be publicly released +to address the lack of open datasets in orthodontics, promote benchmarking, and +inspire new research. CHaRNet achieves a Mean Euclidean Distance Error (MEDE) +of 1.28 mm and a Mean Success Ratio (MSR) of 82.40%, demonstrating robust +performance. Notably, it excels in handling irregular dental geometries, such +as models with missing teeth. This end-to-end approach streamlines orthodontic +workflows, improves 3D IOS analysis precision, and facilitates efficient +computer-assisted treatment planning. + +
+
+
+
+
+ + ♻ ☆ Attribution Analysis Meets Model Editing: Advancing Knowledge Correction + in Vision Language Models with VisEdit + + +
+ Model editing aims to correct outdated or erroneous knowledge in large models +without costly retraining. Recent research discovered that the mid-layer +representation of the subject's final token in a prompt has a strong influence +on factual predictions, and developed Large Language Model (LLM) editing +techniques based on this observation. However, for Vision-LLMs (VLLMs), how +visual representations impact the predictions from a decoder-only language +model remains largely unexplored. To the best of our knowledge, model editing +for VLLMs has not been extensively studied in the literature. In this work, we +employ the contribution allocation and noise perturbation methods to measure +the contributions of visual representations for token predictions. Our +attribution analysis shows that visual representations in mid-to-later layers +that are highly relevant to the prompt contribute significantly to predictions. +Based on these insights, we propose VisEdit, a novel model editor for VLLMs +that effectively corrects knowledge by editing intermediate visual +representations in regions important to the edit prompt. We evaluated VisEdit +using multiple VLLM backbones and public VLLM editing benchmark datasets. The +results show the superiority of VisEdit over the strong baselines adapted from +existing state-of-the-art editors for LLMs. + +
+
+ comment: Accepted to AAAI-2025 as an oral presentation +
+
+
+
+
+ + ♻ ☆ RORem: Training a Robust Object Remover with Human-in-the-Loop + + +
+ Despite the significant advancements, existing object removal methods +struggle with incomplete removal, incorrect content synthesis and blurry +synthesized regions, resulting in low success rates. Such issues are mainly +caused by the lack of high-quality paired training data, as well as the +self-supervised training paradigm adopted in these methods, which forces the +model to in-paint the masked regions, leading to ambiguity between synthesizing +the masked objects and restoring the background. To address these issues, we +propose a semi-supervised learning strategy with human-in-the-loop to create +high-quality paired training data, aiming to train a Robust Object Remover +(RORem). We first collect 60K training pairs from open-source datasets to train +an initial object removal model for generating removal samples, and then +utilize human feedback to select a set of high-quality object removal pairs, +with which we train a discriminator to automate the following training data +generation process. By iterating this process for several rounds, we finally +obtain a substantial object removal dataset with over 200K pairs. Fine-tuning +the pre-trained stable diffusion model with this dataset, we obtain our RORem, +which demonstrates state-of-the-art object removal performance in terms of both +reliability and image quality. Particularly, RORem improves the object removal +success rate over previous methods by more than 18\%. The dataset, source code +and trained model are available at https://github.com/leeruibin/RORem. + +
+
+
+
+
+ + ♻ ☆ Explicitly Disentangled Representations in Object-Centric Learning + + +
+ Extracting structured representations from raw visual data is an important +and long-standing challenge in machine learning. Recently, techniques for +unsupervised learning of object-centric representations have raised growing +interest. In this context, enhancing the robustness of the latent features can +improve the efficiency and effectiveness of the training of downstream tasks. A +promising step in this direction is to disentangle the factors that cause +variation in the data. Previously, Invariant Slot Attention disentangled +position, scale, and orientation from the remaining features. Extending this +approach, we focus on separating the shape and texture components. In +particular, we propose a novel architecture that biases object-centric models +toward disentangling shape and texture components into two non-overlapping +subsets of the latent space dimensions. These subsets are known a priori, hence +before the training process. Experiments on a range of object-centric +benchmarks reveal that our approach achieves the desired disentanglement while +also numerically improving baseline performance in most cases. In addition, we +show that our method can generate novel textures for a specific object or +transfer textures between objects with distinct shapes. + +
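+ Because the shape/texture subsets are fixed a priori, texture transfer reduces
+to copying a known slice of the latent vector; the sketch below illustrates this
+with arbitrary dimension choices that are not taken from the paper.
+```python
+import torch
+
+SHAPE_DIMS = slice(0, 32)     # a-priori split of the slot latent
+TEXTURE_DIMS = slice(32, 64)  # (dimension choices here are purely illustrative)
+
+def swap_texture(slot_a: torch.Tensor, slot_b: torch.Tensor) -> torch.Tensor:
+    """Return a new slot that keeps object A's shape code but takes object B's
+    texture code, which a decoder could then render."""
+    new_slot = slot_a.clone()
+    new_slot[TEXTURE_DIMS] = slot_b[TEXTURE_DIMS]
+    return new_slot
+
+slot_a, slot_b = torch.randn(64), torch.randn(64)
+mixed = swap_texture(slot_a, slot_b)
+assert torch.equal(mixed[SHAPE_DIMS], slot_a[SHAPE_DIMS])
+assert torch.equal(mixed[TEXTURE_DIMS], slot_b[TEXTURE_DIMS])
+```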
+
+ comment: Published in TMLR +
+
+
+
+
+ + ♻ ☆ Hunyuan3D 1.0: A Unified Framework for Text-to-3D and Image-to-3D + Generation + + +
+ While 3D generative models have greatly improved artists' workflows, existing
+diffusion models for 3D generation suffer from slow generation and poor
+generalization. To address this issue, we propose a two-stage approach named
+Hunyuan3D 1.0, including a lite version and a standard version, both of which
+support text- and image-conditioned generation. In the first stage, we employ a
+multi-view diffusion model that efficiently generates multi-view RGB images in
+approximately 4 seconds. These multi-view images capture rich details of the 3D
+asset from different viewpoints, relaxing the task from single-view to
+multi-view reconstruction. In the second stage, we introduce a feed-forward
+reconstruction model that rapidly and faithfully reconstructs the 3D asset
+given the generated multi-view images in approximately 7 seconds. The
+reconstruction network learns to handle the noise and inconsistency introduced
+by the multi-view diffusion and leverages the available information from the
+condition image to efficiently recover the 3D structure. Our framework involves
+the text-to-image model, i.e., Hunyuan-DiT, making it a unified framework that
+supports both text- and image-conditioned 3D generation. Our standard version
+has 3x more parameters than our lite version and other existing models. Our
+Hunyuan3D 1.0 achieves an impressive balance between speed and quality,
+significantly reducing generation time while maintaining the quality and
+diversity of the produced assets.
+

+
+ comment: Technical Report; 3D Generation +
+
+
+
+
+ + ♻ ☆ Eve: Efficient Multimodal Vision Language Models with Elastic Visual + Experts + + +
+ Multimodal vision language models (VLMs) have made significant progress with
+the support of continuously increasing model sizes and data volumes. Running
+VLMs on edge devices has become a challenge for their widespread application.
+There are several efficient VLM efforts, but they often sacrifice linguistic
+capabilities to enhance multimodal abilities, or require extensive training. To
+address this quandary, we introduce the innovative framework of Efficient
+Vision Language Models with Elastic Visual Experts (Eve). By strategically
+incorporating adaptable visual expertise at multiple stages of training, Eve
+strikes a balance between preserving linguistic abilities and augmenting
+multimodal capabilities. This balanced approach results in a versatile model
+with only 1.8B parameters that delivers significant improvements in both
+multimodal and linguistic tasks. Notably, in configurations below 3B
+parameters, Eve distinctly outperforms on language benchmarks and achieves
+state-of-the-art results of 68.87% on VLM benchmarks. Additionally, its
+multimodal accuracy outstrips that of the larger 7B LLaVA-1.5 model. Our code
+is available at https://github.com/rangmiao/Eve.
+

+
+
+
+
+ + ♻ ☆ Guided Reconstruction with Conditioned Diffusion Models for Unsupervised + Anomaly Detection in Brain MRIs + + +
+ The application of supervised models to clinical screening tasks is +challenging due to the need for annotated data for each considered pathology. +Unsupervised Anomaly Detection (UAD) is an alternative approach that aims to +identify any anomaly as an outlier from a healthy training distribution. A +prevalent strategy for UAD in brain MRI involves using generative models to +learn the reconstruction of healthy brain anatomy for a given input image. As +these models should fail to reconstruct unhealthy structures, the +reconstruction errors indicate anomalies. However, a significant challenge is +to balance the accurate reconstruction of healthy anatomy and the undesired +replication of abnormal structures. While diffusion models have shown promising +results with detailed and accurate reconstructions, they face challenges in +preserving intensity characteristics, resulting in false positives. We propose +conditioning the denoising process of diffusion models with additional +information derived from a latent representation of the input image. We +demonstrate that this conditioning allows for accurate and local adaptation to +the general input intensity distribution while avoiding the replication of +unhealthy structures. We compare the novel approach to different +state-of-the-art methods and for different data sets. Our results show +substantial improvements in the segmentation performance, with the Dice score +improved by 11.9%, 20.0%, and 44.6%, for the BraTS, ATLAS and MSLUB data sets, +respectively, while maintaining competitive performance on the WMH data set. +Furthermore, our results indicate effective domain adaptation across different +MRI acquisitions and simulated contrasts, an important attribute for general +anomaly detection methods. The code for our work is available at +https://github.com/FinnBehrendt/Conditioned-Diffusion-Models-UAD + +
+
+ comment: Preprint: Accepted paper at Computers in Biology and Medicine
+

+
+
+
+
+ + ♻ ☆ Robust Simultaneous Multislice MRI Reconstruction Using Deep Generative + Priors + + +
+ Simultaneous multislice (SMS) imaging is a powerful technique for +accelerating magnetic resonance imaging (MRI) acquisitions. However, SMS +reconstruction remains challenging due to complex signal interactions between +and within the excited slices. In this study, we introduce ROGER, a robust SMS +MRI reconstruction method based on deep generative priors. Utilizing denoising +diffusion probabilistic models (DDPM), ROGER begins with Gaussian noise and +gradually recovers individual slices through reverse diffusion iterations while +enforcing data consistency from measured k-space data within the readout +concatenation framework. The posterior sampling procedure is designed such that +the DDPM training can be performed on single-slice images without requiring +modifications for SMS tasks. Additionally, our method incorporates a +low-frequency enhancement (LFE) module to address the practical issue that +SMS-accelerated fast spin echo (FSE) and echo planar imaging (EPI) sequences +cannot easily embed fully-sampled autocalibration signals. Extensive +experiments on both retrospectively and prospectively accelerated datasets +demonstrate that ROGER consistently outperforms existing methods, enhancing +both anatomical and functional imaging with strong out-of-distribution +generalization. The source code and sample data for ROGER are available at +https://github.com/Solor-pikachu/ROGER. + +
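+ The readout-concatenation consistency operator used by ROGER is SMS-specific;
+the generic single-coil Cartesian data-consistency step below only illustrates
+the broader idea of overwriting sampled k-space locations with measured values
+at each reverse-diffusion iteration.
+```python
+import torch
+
+def data_consistency(image: torch.Tensor, measured_kspace: torch.Tensor,
+                     mask: torch.Tensor) -> torch.Tensor:
+    """Enforce consistency with measured k-space samples: keep the model's
+    prediction at unsampled frequencies and the measurements elsewhere
+    (single-coil Cartesian illustration, not the SMS readout-concatenation operator)."""
+    k_pred = torch.fft.fft2(image)
+    k_mixed = torch.where(mask, measured_kspace, k_pred)
+    return torch.fft.ifft2(k_mixed).real
+
+x = torch.rand(1, 256, 256)                    # current denoised estimate
+mask = torch.rand(1, 256, 256) < 0.3           # sampled k-space locations
+y = torch.fft.fft2(torch.rand(1, 256, 256))    # simulated measured k-space
+print(data_consistency(x, y, mask).shape)
+```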
+
+ comment: Submitted to Medical Image Analysis. New fMRI analysis and figures + are added since v1 +
+
+
+
+
+ + ♻ ☆ Cross-Dataset Gaze Estimation by Evidential Inter-intra Fusion ACM MM 2024 + + +
+ Achieving accurate and reliable gaze predictions in complex and diverse
+environments remains challenging. Fortunately, it is straightforward to access
+diverse gaze datasets in real-world applications. We discover that training on
+these datasets jointly can significantly improve the generalization of gaze
+estimation, which is overlooked in previous works. However, due to the inherent
+distribution shift across different datasets, simply mixing multiple datasets
+decreases the performance in the original domain despite gaining better
+generalization abilities. To address the problem of ``cross-dataset gaze
+estimation'', we propose a novel Evidential Inter-intra Fusion (EIF) framework
+for training a cross-dataset model that performs well across all source and
+unseen domains. Specifically, we build independent single-dataset branches for
+various datasets, where the data space is partitioned into overlapping
+subspaces within each dataset for local regression, and further create a
+cross-dataset branch to integrate the generalizable features from the
+single-dataset branches. Furthermore, evidential regressors based on the
+Normal Inverse-Gamma (NIG) distribution are designed to additionally provide
+uncertainty estimation apart from predicting gaze. Building upon this
+foundation, our proposed framework achieves both intra-evidential fusion among
+multiple local regressors within each dataset and inter-evidential fusion among
+multiple branches by a Mixture of Normal Inverse-Gamma (MoNIG) distribution.
+Experiments demonstrate that our method consistently achieves notable
+improvements in both source domains and unseen domains.
+

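+ The NIG evidential head can be sketched with the standard deep evidential
+regression negative log-likelihood (Amini et al., 2020); whether EIF uses
+exactly this form is an assumption, and the toy shapes below simply stand in
+for 2D gaze targets.
+```python
+import math
+import torch
+
+def nig_nll(y, gamma, nu, alpha, beta):
+    """Negative log-likelihood of targets y under a Normal-Inverse-Gamma head
+    (gamma: predictive mean, nu > 0, alpha > 1, beta > 0), following the common
+    deep evidential regression formulation; nu and alpha also yield an
+    uncertainty estimate alongside the gaze prediction."""
+    omega = 2.0 * beta * (1.0 + nu)
+    return (0.5 * torch.log(math.pi / nu)
+            - alpha * torch.log(omega)
+            + (alpha + 0.5) * torch.log(nu * (y - gamma) ** 2 + omega)
+            + torch.lgamma(alpha) - torch.lgamma(alpha + 0.5)).mean()
+
+# Toy regressor outputs (positivity would normally be enforced with softplus).
+y = torch.randn(8, 2)                       # e.g. 2D gaze targets
+gamma = torch.randn(8, 2, requires_grad=True)
+nu = torch.rand(8, 2) + 0.1
+alpha = torch.rand(8, 2) + 1.1
+beta = torch.rand(8, 2) + 0.1
+nig_nll(y, gamma, nu, alpha, beta).backward()
+```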
+
+ comment: This paper was previously submitted to ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ A Training-free Sub-quadratic Cost Transformer Model Serving Framework + With Hierarchically Pruned Attention + + +
+ In modern large language models (LLMs), increasing the context length is +crucial for improving comprehension and coherence in long-context, multi-modal, +and retrieval-augmented language generation. While many recent transformer +models attempt to extend their context length over a million tokens, they +remain impractical due to the quadratic time and space complexities. Although +recent works on linear and sparse attention mechanisms can achieve this goal, +their real-world applicability is often limited by the need to re-train from +scratch and significantly worse performance. In response, we propose a novel +approach, Hierarchically Pruned Attention (HiP), which reduces the time +complexity of the attention mechanism to $O(T \log T)$ and the space complexity +to $O(T)$, where $T$ is the sequence length. We notice a pattern in the +attention scores of pretrained LLMs where tokens close together tend to have +similar scores, which we call ``attention locality''. Based on this +observation, we utilize a novel tree-search-like algorithm that estimates the +top-$k$ key tokens for a given query on the fly, which is mathematically +guaranteed to have better performance than random attention pruning. In +addition to improving the time complexity of the attention mechanism, we +further optimize GPU memory usage by implementing KV cache offloading, which +stores only $O(\log T)$ tokens on the GPU while maintaining similar decoding +throughput. Experiments on benchmarks show that HiP, with its training-free +nature, significantly reduces both prefill and decoding latencies, as well as +memory usage, while maintaining high-quality generation with minimal +degradation. HiP enables pretrained LLMs to scale up to millions of tokens on +commodity GPUs, potentially unlocking long-context LLM applications previously +deemed infeasible. + +
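+ As a rough illustration of the underlying idea -- each query attends only to
+its highest-scoring keys -- the exhaustive (quadratic) variant can be written
+as below. This is a hedged Python sketch, not the authors' $O(T \log T)$
+algorithm, which estimates the top-$k$ keys hierarchically instead of
+computing the full score matrix:
+
+    import numpy as np
+
+    def topk_sparse_attention(Q, K, V, k=64):
+        """Each query attends to its k highest-scoring keys only (exhaustive top-k)."""
+        scores = Q @ K.T / np.sqrt(Q.shape[-1])             # full (T, T) scores, for illustration
+        idx = np.argpartition(scores, -k, axis=-1)[:, -k:]  # top-k key indices per query
+        out = np.zeros_like(Q)
+        for i, cols in enumerate(idx):
+            w = np.exp(scores[i, cols] - scores[i, cols].max())
+            out[i] = (w / w.sum()) @ V[cols]                 # softmax over the selected keys only
+        return out
+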
+
+ comment: 44 pages +
+
+
+
+
+ + ♻ ☆ TT-BLIP: Enhancing Fake News Detection Using BLIP and Tri-Transformer + + +
+ Detecting fake news has received a lot of attention. Many previous methods +concatenate independently encoded unimodal data, ignoring the benefits of +integrated multimodal information. Also, the absence of specialized feature +extraction for text and images further limits these methods. This paper +introduces an end-to-end model called TT-BLIP that applies the bootstrapping +language-image pretraining for unified vision-language understanding and +generation (BLIP) for three types of information: BERT and BLIPTxt for text, +ResNet and BLIPImg for images, and bidirectional BLIP encoders for multimodal +information. The Multimodal Tri-Transformer fuses tri-modal features using +three types of multi-head attention mechanisms, ensuring integrated modalities +for enhanced representations and improved multimodal data analysis. The +experiments are performed using two fake news datasets, Weibo and Gossipcop. +The results indicate TT-BLIP outperforms the state-of-the-art models. + +
+
+ comment: 8 pages, Accepted 27th International Conference on Information + Fusion, FUSION 2024 +
+
+
+
+
+ + ♻ ☆ TASAR: Transfer-based Attack on Skeletal Action Recognition + + +
+ Skeletal sequences, as well-structured representations of human behaviors, +play a vital role in Human Activity Recognition (HAR). The transferability of +adversarial skeletal sequences enables attacks in real-world HAR scenarios, +such as autonomous driving, intelligent surveillance, and human-computer +interactions. However, most existing skeleton-based HAR (S-HAR) attacks are +primarily designed for white-box scenarios and exhibit weak adversarial +transferability. Therefore, they cannot be considered true transfer-based S-HAR +attacks. More importantly, the reason for this failure remains unclear. In this +paper, we study this phenomenon through the lens of loss surface, and find that +its sharpness contributes to the weak transferability in S-HAR. Inspired by +this observation, we assume and empirically validate that smoothening the +rugged loss landscape could potentially improve adversarial transferability in +S-HAR. To this end, we propose the first \textbf{T}ransfer-based +\textbf{A}ttack on \textbf{S}keletal \textbf{A}ction \textbf{R}ecognition, +TASAR. TASAR explores the smoothed model posterior without requiring surrogate +re-training, which is achieved by a new post-train Dual Bayesian optimization +strategy. Furthermore, unlike previous transfer-based attacks that treat each +frame independently and overlook temporal coherence within sequences, TASAR +incorporates motion dynamics into the Bayesian attack gradient, effectively +disrupting the spatial-temporal coherence of S-HARs. To exhaustively evaluate +the effectiveness of existing methods and our method, we build the first +large-scale robust S-HAR benchmark, comprising 7 S-HAR models, 10 attack +methods, 3 S-HAR datasets and 2 defense methods. Extensive results demonstrate +the superiority of TASAR. Our benchmark enables easy comparisons for future +studies, with the code available in the supplementary material. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.08572 +
+
+
+
+
+ + ♻ ☆ Rethinking Pre-Trained Feature Extractor Selection in Multiple Instance + Learning for Whole Slide Image Classification + + +
+ Multiple instance learning (MIL) has become a preferred method for gigapixel +whole slide image (WSI) classification without requiring patch-level +annotations. Current MIL research primarily relies on embedding-based +approaches, which extract patch features using a pre-trained feature extractor +and aggregate them for slide-level prediction. Despite the critical role of +feature extraction, there is limited guidance on selecting optimal feature +extractors to maximize WSI performance. This study addresses this gap by +systematically evaluating MIL feature extractors across three dimensions: +pre-training dataset, backbone model, and pre-training method. Extensive +experiments were conducted on two public WSI datasets (TCGA-NSCLC and +Camelyon16) using four state-of-the-art (SOTA) MIL models. Our findings reveal +that: 1) selecting a robust self-supervised learning (SSL) method has a greater +impact on performance than relying solely on an in-domain pre-training dataset; +2) prioritizing Transformer-based backbones with deeper architectures over +CNN-based models; and 3) using larger, more diverse pre-training datasets +significantly enhances classification outcomes. We hope that these insights can +provide practical guidance for optimizing WSI classification and explain the +reasons behind the performance advantages of the current SOTA pathology +foundation models. Furthermore, this work may inform the development of more +effective pathology foundation models. Our code is publicly available at +https://github.com/bryanwong17/MIL-Feature-Extractor-Selection + +
+
+ comment: Accepted to IEEE International Symposium on Biomedical Imaging (ISBI) + 2025 +
+
+
+
+
+ + ♻ ☆ QCS: Feature Refining from Quadruplet Cross Similarity for Facial + Expression Recognition + + +
+ Facial expression recognition faces challenges where labeled significant +features in datasets are mixed with unlabeled redundant ones. In this paper, we +introduce Cross Similarity Attention (CSA) to mine richer intrinsic information +from image pairs, overcoming a limitation when the Scaled Dot-Product Attention +of ViT is directly applied to calculate the similarity between two different +images. Based on CSA, we simultaneously minimize intra-class differences and +maximize inter-class differences at the fine-grained feature level through +interactions among multiple branches. Contrastive residual distillation is +utilized to transfer the information learned in the cross module back to the +base network. We ingeniously design a four-branch centrally symmetric network, +named Quadruplet Cross Similarity (QCS), which alleviates gradient conflicts +arising from the cross module and achieves balanced and stable training. It can +adaptively extract discriminative features while isolating redundant ones. The +cross-attention modules exist during training, and only one base branch is +retained during inference, resulting in no increase in inference time. +Extensive experiments show that our proposed method achieves state-of-the-art +performance on several FER datasets. + +
+
+
+
+
+ + ♻ ☆ Adaptive Retention & Correction for Continual Learning ICLR 2025 + + +
+ Continual learning, also known as lifelong learning or incremental learning, +refers to the process by which a model learns from a stream of incoming data +over time. A common problem in continual learning is the classification layer's +bias towards the most recent task. Traditionally, methods have relied on +incorporating data from past tasks during training to mitigate this issue. +However, the recent shift in continual learning to memory-free environments has +rendered these approaches infeasible. In this study, we propose a solution +focused on the testing phase. We first introduce a simple Out-of-Task Detection +method, OTD, designed to accurately identify samples from past tasks during +testing. Leveraging OTD, we then propose: (1) an Adaptive Retention mechanism +for dynamically tuning the classifier layer on past task data; (2) an Adaptive +Correction mechanism for revising predictions when the model classifies data +from previous tasks into classes from the current task. We name our approach +Adaptive Retention & Correction (ARC). While designed for memory-free +environments, ARC also proves effective in memory-based settings. Extensive +experiments show that our proposed method can be plugged in to virtually any +existing continual learning approach without requiring any modifications to its +training procedure. Specifically, when integrated with state-of-the-art +approaches, ARC achieves an average performance increase of 2.7% and 2.6% on +the CIFAR-100 and Imagenet-R datasets, respectively. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ DWCL: Dual-Weighted Contrastive Learning for Multi-View Clustering + + +
+ Multi-view contrastive clustering (MVCC) has gained significant attention for +generating consistent clustering structures from multiple views through +contrastive learning. However, most existing MVCC methods create cross-views by +combining any two views, leading to a high volume of unreliable pairs. +Furthermore, these approaches often overlook discrepancies in multi-view +representations, resulting in representation degeneration. To address these +challenges, we introduce a novel model called Dual-Weighted Contrastive +Learning (DWCL) for Multi-View Clustering. Specifically, to reduce the impact +of unreliable cross-views, we introduce an innovative Best-Other (B-O) +contrastive mechanism that enhances the representation of individual views at a +low computational cost. Furthermore, we develop a dual weighting strategy that +combines a view quality weight, reflecting the quality of each view, with a +view discrepancy weight. This approach effectively mitigates representation +degeneration by downplaying cross-views that are both low in quality and high +in discrepancy. We theoretically validate the efficiency of the B-O contrastive +mechanism and the effectiveness of the dual weighting strategy. Extensive +experiments demonstrate that DWCL outperforms previous methods across eight +multi-view datasets, showcasing superior performance and robustness in MVCC. +Specifically, our method achieves absolute accuracy improvements of 5.4\% and +5.6\% compared to state-of-the-art methods on the Caltech6V7 and MSRCv1 +datasets, respectively. + +
+
+
+
+
+ + ♻ ☆ OpenOmni: Large Language Models Pivot Zero-shot Omnimodal Alignment + across Language with Real-time Self-Aware Emotional Speech Synthesis + + +
+ Recent advancements in omnimodal learning have been achieved in understanding +and generation across images, text, and speech, though mainly within +proprietary models. Limited omnimodal datasets and the inherent challenges +associated with real-time emotional speech generation have hindered open-source +progress. To address these issues, we propose openomni, a two-stage training +method combining omnimodal alignment and speech generation to develop a +state-of-the-art omnimodal large language model. In the alignment phase, a +pre-trained speech model is further trained on text-image tasks to generalize +from vision to speech in a (near) zero-shot manner, outperforming models +trained on tri-modal datasets. In the speech generation phase, a lightweight +decoder facilitates real-time emotional speech through training on speech tasks +and preference learning. Experiments demonstrate that openomni consistently +improves across omnimodal, vision-language, and speech-language evaluations, +enabling natural, emotion-rich dialogues and real-time emotional speech +generation. + +
+
+
+
+
+ + ♻ ☆ Learning Contrastive Feature Representations for Facial Action Unit + Detection + + +
+ For the Facial Action Unit (AU) detection task, accurately capturing the +subtle facial differences between distinct AUs is essential for reliable +detection. Additionally, AU detection faces challenges from class imbalance and +the presence of noisy or false labels, which undermine detection accuracy. In +this paper, we introduce a novel contrastive learning framework aimed for AU +detection that incorporates both self-supervised and supervised signals, +thereby enhancing the learning of discriminative features for accurate AU +detection. To tackle the class imbalance issue, we employ a negative sample +re-weighting strategy that adjusts the step size of updating parameters for +minority and majority class samples. Moreover, to address the challenges posed +by noisy and false AU labels, we employ a sampling technique that encompasses +three distinct types of positive sample pairs. This enables us to inject +self-supervised signals into the supervised signal, effectively mitigating the +adverse effects of noisy labels. Our experimental assessments, conducted on +five widely-utilized benchmark datasets (BP4D, DISFA, BP4D+, GFT and +Aff-Wild2), underscore the superior performance of our approach compared to +state-of-the-art methods of AU detection. + +
+
+ comment: 35 pages, 20 figures, submitted to Pattern Recognition (PR) +
+
+
+
+
+ + ♻ ☆ A solvable generative model with a linear, one-step denoiser + + +
+ We develop an analytically tractable single-step diffusion model based on a
+linear denoiser and present an explicit formula for the Kullback-Leibler
+divergence between the generated and sampling distributions, the latter taken
+to be an isotropic Gaussian, showing the effect of finite diffusion time and
+noise scale. Our study further reveals that the monotonic fall phase of the
+Kullback-Leibler divergence begins when the training dataset size reaches the
+dimension of the data points. Along the way, we provide a mathematically
+precise definition of the memorization-to-non-memorization transition when
+only a finite number of data points is available. It is shown that the
+simplified model also features this transition during the monotonic fall phase
+of the aforementioned Kullback-Leibler divergence. For large-scale practical
+diffusion models, we explain why a higher number of diffusion steps enhances
+production quality, based on the theoretical arguments presented before. In
+addition, we show that more diffusion steps do not necessarily help in
+reducing memorization. These two facts combined suggest the existence of an
+optimal number of diffusion steps for a finite number of training samples.
+
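+ For reference, the basic closed-form quantity behind such an analysis, the KL
+divergence between two $d$-dimensional isotropic Gaussians, is the standard
+textbook expression (not the paper's specific formula, which additionally
+accounts for diffusion time, noise scale and the training data):
+
+    $D_{\mathrm{KL}}\big(\mathcal{N}(\mu_1,\sigma_1^2 I_d)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2 I_d)\big)
+      = \tfrac{1}{2}\Big[\tfrac{d\,\sigma_1^2 + \|\mu_1-\mu_2\|^2}{\sigma_2^2} - d + d\ln\tfrac{\sigma_2^2}{\sigma_1^2}\Big]$
+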
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ Low-Resolution Self-Attention for Semantic Segmentation + + +
+ Semantic segmentation tasks naturally require high-resolution information for
+pixel-wise segmentation and global context information for class prediction.
+While existing vision transformers demonstrate promising performance, they
+often utilize high-resolution context modeling, resulting in a computational
+bottleneck. In this work, we challenge conventional wisdom and introduce the
+Low-Resolution Self-Attention (LRSA) mechanism to capture global context at a
+significantly reduced computational cost, i.e., FLOPs. Our approach involves
+computing self-attention in a fixed low-resolution space regardless of the
+input image's resolution, with additional 3x3 depth-wise convolutions to
+capture fine details in the high-resolution space. We demonstrate the
+effectiveness of our LRSA approach by building the LRFormer, a vision
+transformer with an encoder-decoder structure. Extensive experiments on the
+ADE20K, COCO-Stuff, and Cityscapes datasets demonstrate that LRFormer
+outperforms state-of-the-art models. The code is available at
+https://github.com/yuhuan-wu/LRFormer.
+
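+ A minimal PyTorch-style sketch of the idea (a hypothetical module, not the
+LRFormer implementation; the pooling size, attention and upsampling choices
+are assumptions) attends in a fixed low-resolution grid and restores detail
+with a 3x3 depth-wise convolution:
+
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class LowResSelfAttention(nn.Module):
+        def __init__(self, channels, pool_size=16, num_heads=4):
+            super().__init__()                     # channels must be divisible by num_heads
+            self.pool_size = pool_size
+            self.attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)
+            self.dwconv = nn.Conv2d(channels, channels, 3, padding=1, groups=channels)
+
+        def forward(self, x):                      # x: (B, C, H, W)
+            B, C, H, W = x.shape
+            low = F.adaptive_avg_pool2d(x, self.pool_size)   # fixed low-resolution space
+            tokens = low.flatten(2).transpose(1, 2)          # (B, P*P, C)
+            ctx, _ = self.attn(tokens, tokens, tokens)       # cheap global context
+            ctx = ctx.transpose(1, 2).reshape(B, C, self.pool_size, self.pool_size)
+            ctx = F.interpolate(ctx, size=(H, W), mode="bilinear", align_corners=False)
+            return x + self.dwconv(ctx)                      # recover fine high-resolution detail
+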
+
+ comment: added many experiments. 13 pages, 12 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ Flow-Guided Diffusion for Video Inpainting + + +
+ Video inpainting has been challenged by complex scenarios like large
+movements and low-light conditions. Current methods, including emerging
+diffusion models, face limitations in quality and efficiency. This paper
+introduces the Flow-Guided Diffusion model for Video Inpainting (FGDVI), a
+novel approach that significantly enhances temporal consistency and inpainting
+quality by reusing an off-the-shelf image generation diffusion model. We
+employ optical flow for precise one-step latent propagation and introduce a
+model-agnostic flow-guided latent interpolation technique. This technique
+expedites denoising, seamlessly integrating with any Video Diffusion Model
+(VDM) without additional training. Our FGDVI demonstrates a remarkable 10%
+improvement in flow warping error E_warp over existing state-of-the-art
+methods. Our comprehensive experiments validate the superior performance of
+FGDVI, offering a promising direction for advanced video inpainting. The code
+and detailed results will be publicly available at
+https://github.com/NevSNev/FGDVI.
+
+
+ comment: This paper has been withdrawn as a new iteration of the work has been + developed, which includes significant improvements and refinements based on + this submission. The withdrawal is made to ensure academic integrity and + compliance with publication standards. If you are interested, please refer to + the updated work at arXiv:2412.00857 +
+
+
+
+
+ + ♻ ☆ Cross-D Conv: Cross-Dimensional Transferable Knowledge Base via Fourier + Shifting Operation + + +
+ In biomedical imaging analysis, the dichotomy between 2D and 3D data presents +a significant challenge. While 3D volumes offer superior real-world +applicability, they are less available for each modality and not easy to train +in large scale, whereas 2D samples are abundant but less comprehensive. This +paper introduces Cross-D Conv operation, a novel approach that bridges the +dimensional gap by learning the phase shifting in the Fourier domain. Our +method enables seamless weight transfer between 2D and 3D convolution +operations, effectively facilitating cross-dimensional learning. The proposed +architecture leverages the abundance of 2D training data to enhance 3D model +performance, offering a practical solution to the multimodal data scarcity +challenge in 3D medical model pretraining. Experimental validation on the +RadImagenet (2D) and multimodal volumetric sets demonstrates that our approach +achieves comparable or superior performance in feature quality assessment. The +enhanced convolution operation presents new opportunities for developing +efficient classification and segmentation models in medical imaging. This work +represents an advancement in cross-dimensional and multimodal medical image +analysis, offering a robust framework for utilizing 2D priors in 3D model +pretraining while maintaining computational efficiency of 2D training. + +
+
+ comment: Accepted for ISBI25; Codes&Weights: + https://github.com/convergedmachine/Cross-D-Conv +
+
+
+
+
+ + ♻ ☆ SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and + Chain-of-Thought for Embodied Task Planning + + +
+ Spatial reasoning is an essential problem in embodied AI research. Efforts to +enhance spatial reasoning abilities through supplementary spatial data and +fine-tuning have proven limited and ineffective when addressing complex +embodied tasks, largely due to their dependence on language-based outputs. +While some approaches have introduced a point-based action space to mitigate +this issue, they fall short in managing more intricate tasks within complex +environments. This deficiency arises from their failure to fully exploit the +inherent thinking and reasoning capabilities that are fundamental strengths of +Vision-Language Models (VLMs). To address these limitations, we propose a novel +approach named SpatialCoT, specifically designed to bolster the spatial +reasoning capabilities of VLMs. Our approach comprises two stages: spatial +coordinate bi-directional alignment, which aligns vision-language inputs with +spatial coordinates, and chain-of-thought spatial grounding, which harnesses +the reasoning capabilities of language models for advanced spatial reasoning. +We evaluate SpatialCoT on challenging navigation and manipulation tasks, both +in simulation and real-world settings. Experimental results demonstrate that +our method significantly outperforms previous state-of-the-art approaches in +both tasks. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Boosting Diffusion Guidance via Learning Degradation-Aware Models for + Blind Super Resolution + + +
+ Recently, diffusion-based blind super-resolution (SR) methods have shown +great ability to generate high-resolution images with abundant high-frequency +detail, but the detail is often achieved at the expense of fidelity. Meanwhile, +another line of research focusing on rectifying the reverse process of +diffusion models (i.e., diffusion guidance), has demonstrated the power to +generate high-fidelity results for non-blind SR. However, these methods rely on +known degradation kernels, making them difficult to apply to blind SR. To +address these issues, we present DADiff in this paper. DADiff incorporates +degradation-aware models into the diffusion guidance framework, eliminating the +need to know degradation kernels. Additionally, we propose two novel +techniques: input perturbation and guidance scalar, to further improve our +performance. Extensive experimental results show that our proposed method has +superior performance over state-of-the-art methods on blind SR benchmarks. + +
+
+ comment: To appear in WACV 2025. Code is available at: + https://github.com/ryanlu2240/DADiff +
+
+
+
+
+ + ♻ ☆ MeshLRM: Large Reconstruction Model for High-Quality Meshes + + +
+ We propose MeshLRM, a novel LRM-based approach that can reconstruct a +high-quality mesh from merely four input images in less than one second. +Different from previous large reconstruction models (LRMs) that focus on +NeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction +and rendering within the LRM framework. This allows for end-to-end mesh +reconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering. +Moreover, we improve the LRM architecture by simplifying several complex +designs in previous LRMs. MeshLRM's NeRF initialization is sequentially trained +with low- and high-resolution images; this new LRM training strategy enables +significantly faster convergence and thereby leads to better quality with less +compute. Our approach achieves state-of-the-art mesh reconstruction from +sparse-view inputs and also allows for many downstream applications, including +text-to-3D and single-image-to-3D generation. Project page: +https://sarahweiii.github.io/meshlrm/ + +
+
+
+
+
+ + ♻ MOS-Attack: A Scalable Multi-objective Adversarial Attack Framework CVPR 2025 + + +
+ Crafting adversarial examples is crucial for evaluating and enhancing the
+robustness of Deep Neural Networks (DNNs), presenting a challenge equivalent to
+maximizing a non-differentiable 0-1 loss function.
+ However, existing single-objective methods, namely adversarial attacks that
+focus on a single surrogate loss function, do not fully harness the benefits
+of engaging multiple loss functions, owing to an insufficient understanding of
+their synergistic and conflicting nature.
+ To overcome these limitations, we propose the Multi-Objective Set-based
+Attack (MOS Attack), a novel adversarial attack framework leveraging multiple
+loss functions and automatically uncovering their interrelations.
+ The MOS Attack adopts a set-based multi-objective optimization strategy,
+enabling the incorporation of numerous loss functions without additional
+parameters.
+ It also automatically mines synergistic patterns among various losses,
+facilitating the generation of potent adversarial attacks with fewer
+objectives.
+ Extensive experiments have shown that our MOS Attack outperforms
+single-objective attacks. Furthermore, by harnessing the identified
+synergistic patterns, MOS Attack continues to show superior results with a
+reduced number of loss functions.
+
+
+ comment: Under Review of CVPR 2025 +
+
+
+
+
+ + ♻ ☆ Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using + Real-Time Warped Noise + + +
+ Generative modeling aims to transform random noise into structured outputs. +In this work, we enhance video diffusion models by allowing motion control via +structured latent noise sampling. This is achieved by just a change in data: we +pre-process training videos to yield structured noise. Consequently, our method +is agnostic to diffusion model design, requiring no changes to model +architectures or training pipelines. Specifically, we propose a novel noise +warping algorithm, fast enough to run in real time, that replaces random +temporal Gaussianity with correlated warped noise derived from optical flow +fields, while preserving the spatial Gaussianity. The efficiency of our +algorithm enables us to fine-tune modern video diffusion base models using +warped noise with minimal overhead, and provide a one-stop solution for a wide +range of user-friendly motion control: local object motion control, global +camera movement control, and motion transfer. The harmonization between +temporal coherence and spatial Gaussianity in our warped noise leads to +effective motion control while maintaining per-frame pixel quality. Extensive +experiments and user studies demonstrate the advantages of our method, making +it a robust and scalable approach for controlling motion in video diffusion +models. Video results are available on our webpage: +https://eyeline-research.github.io/Go-with-the-Flow. Source code and model +checkpoints are available on GitHub: +https://github.com/Eyeline-Research/Go-with-the-Flow. + +
+
+
+
+
+ + ♻ ☆ Precise and Robust Sidewalk Detection: Leveraging Ensemble Learning to + Surpass LLM Limitations in Urban Environments + + +
+ This study aims to compare the effectiveness of a robust ensemble model with +the state-of-the-art ONE-PEACE Large Language Model (LLM) for accurate +detection of sidewalks. Accurate sidewalk detection is crucial in improving +road safety and urban planning. The study evaluated the model's performance on +Cityscapes, Ade20k, and the Boston Dataset. The results showed that the +ensemble model performed better than the individual models, achieving mean +Intersection Over Union (mIOU) scores of 93.1\%, 90.3\%, and 90.6\% on these +datasets under ideal conditions. Additionally, the ensemble model maintained a +consistent level of performance even in challenging conditions such as +Salt-and-Pepper and Speckle noise, with only a gradual decrease in efficiency +observed. On the other hand, the ONE-PEACE LLM performed slightly better than +the ensemble model in ideal scenarios but experienced a significant decline in +performance under noisy conditions. These findings demonstrate the robustness +and reliability of the ensemble model, making it a valuable asset for improving +urban infrastructure related to road safety and curb space management. This +study contributes positively to the broader context of urban health and +mobility. + +
+
+
+
+
+ + ♻ ☆ Sim-to-Real Domain Adaptation for Deformation Classification + + +
+ Deformation detection is vital for enabling accurate assessment and +prediction of structural changes in materials, ensuring timely and effective +interventions to maintain safety and integrity. Automating deformation +detection through computer vision is crucial for efficient monitoring, but it +faces significant challenges in creating a comprehensive dataset of both +deformed and non-deformed objects, which can be difficult to obtain in many +scenarios. In this paper, we introduce a novel framework for generating +controlled synthetic data that simulates deformed objects. This approach allows +for the realistic modeling of object deformations under various conditions. Our +framework integrates an intelligent adapter network that facilitates +sim-to-real domain adaptation, enhancing classification results without +requiring real data from deformed objects. We conduct experiments on domain +adaptation and classification tasks and demonstrate that our framework improves +sim-to-real classification results compared to simulation baseline. + +
+
+ comment: 7 pages, 5 figures, submitted to SMC +
+
+
+
+
+ + ♻ ☆ Progressive Token Length Scaling in Transformer Encoders for Efficient + Universal Segmentation ICLR 2025 + + +
+ A powerful architecture for universal segmentation relies on transformers
+that encode multi-scale image features and decode object queries into mask
+predictions. With efficiency being a high priority for scaling such models, we
+observed that the state-of-the-art method Mask2Former uses 50% of its compute
+only on the transformer encoder. This is due to the retention of a full-length
+token-level representation of all backbone feature scales at each encoder
+layer. With this observation, we propose a strategy termed PROgressive Token
+Length SCALing for Efficient transformer encoders (PRO-SCALE) that can be
+plugged into the Mask2Former segmentation architecture to significantly reduce
+the computational cost. The underlying principle of PRO-SCALE is: progressively
+scale the length of the tokens with the layers of the encoder. This allows
+PRO-SCALE to reduce computations by a large margin with minimal sacrifice in
+performance (~52% encoder and ~27% overall GFLOPs reduction with no drop in
+performance on the COCO dataset). Experiments conducted on public benchmarks
+demonstrate PRO-SCALE's flexibility in architectural configurations and
+exhibit its potential for extension beyond segmentation tasks to encompass
+object detection. Code here: https://github.com/abhishekaich27/proscale-pytorch
+
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ♻ ☆ POLAR-Sim: Augmenting NASA's POLAR Dataset for Data-Driven Lunar + Perception and Rover Simulation + + +
+ NASA's POLAR dataset contains approximately 2,600 pairs of high dynamic range +stereo photos captured across 13 varied terrain scenarios, including areas with +sparse or dense rock distributions, craters, and rocks of different sizes. The +purpose of these photos is to spur development in robotics, AI-based +perception, and autonomous navigation. Acknowledging a scarcity of lunar images +from around the lunar poles, NASA Ames produced on Earth but in controlled +conditions images that resemble rover operating conditions from these regions +of the Moon. We report on the outcomes of an effort aimed at accomplishing two +tasks. In Task 1, we provided bounding boxes and semantic segmentation +information for all the images in NASA's POLAR dataset. This effort resulted in +23,000 labels and semantic segmentation annotations pertaining to rocks, +shadows, and craters. In Task 2, we generated the digital twins of the 13 +scenarios that have been used to produce all the photos in the POLAR dataset. +Specifically, for each of these scenarios, we produced individual meshes, +texture information, and material properties associated with the ground and the +rocks in each scenario. This allows anyone with a camera model to synthesize +images associated with any of the 13 scenarios of the POLAR dataset. +Effectively, one can generate as many semantically labeled synthetic images as +desired -- with different locations and exposure values in the scene, for +different positions of the sun, with or without the presence of active +illumination, etc. The benefit of this work is twofold. Using outcomes of Task +1, one can train and/or test perception algorithms that deal with Moon images. +For Task 2, one can produce as much data as desired to train and test AI +algorithms that are anticipated to work in lunar conditions. All the outcomes +of this work are available in a public repository for unfettered use and +distribution. + +
+
+ comment: 11 pages, 9 figures. This work has been submitted to the IEEE for + possible publication +
+
+
+
+
+ + ♻ ☆ Utilizing Large Language Models in an iterative paradigm with domain + feedback for zero-shot molecule optimization + + +
+ Molecule optimization is a critical task in drug discovery to optimize +desired properties of a given molecule. Despite Large Language Models (LLMs) +holding the potential to efficiently simulate this task by using natural +language to direct the optimization, straightforwardly utilizing them shows +limited performance. In this work, we facilitate utilizing LLMs in an iterative +paradigm by proposing a simple yet effective domain feedback provider, namely +$\text{Re}^2$DF. In detail, $\text{Re}^2$DF harnesses an external toolkit, +RDKit, to handle the molecule hallucination, if the modified molecule is +chemically invalid. Otherwise, $\text{Re}^2$DF verifies whether the modified +molecule meets the objective, if not, its desired properties are computed and +compared to the original one, establishing reliable domain feedback with +correct direction and distance towards the objective to explicitly guide the +LLM to refine the modified molecule. We conduct experiments across both single- +and multi-property objectives with 2 thresholds, where $\text{Re}^2$DF shows +significant improvements. Notably, for 20 single-property objectives, +$\text{Re}^2$DF enhances Hit ratio by 16.96% and 20.76% under loose +(\texttt{l}) and strict (\texttt{s}) thresholds, respectively. For 32 +multi-property objectives, $\text{Re}^2$DF enhances Hit ratio by 6.04% and +5.25%. + +
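+ A hedged sketch of the feedback step with RDKit (the toolkit named above);
+the property, threshold handling and message format here are illustrative
+placeholders rather than the actual $\text{Re}^2$DF prompts:
+
+    from rdkit import Chem
+    from rdkit.Chem import Descriptors
+
+    def domain_feedback(modified_smiles, original_smiles, target_logp):
+        mol = Chem.MolFromSmiles(modified_smiles)
+        if mol is None:                                  # hallucinated / chemically invalid molecule
+            return "invalid SMILES; keep the original molecule and try a new edit"
+        logp = Descriptors.MolLogP(mol)
+        ref = Descriptors.MolLogP(Chem.MolFromSmiles(original_smiles))
+        if logp >= target_logp:
+            return "objective met"
+        return (f"LogP is {logp:.2f} (original {ref:.2f}); "
+                f"increase it by {target_logp - logp:.2f} to reach the target")
+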
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 22 + +
+
+
+ + ☆ AdaWM: Adaptive World Model based Planning for Autonomous Driving ICLR 2025 + + +
+ World model based reinforcement learning (RL) has emerged as a promising +approach for autonomous driving, which learns a latent dynamics model and uses +it to train a planning policy. To speed up the learning process, the +pretrain-finetune paradigm is often used, where online RL is initialized by a +pretrained model and a policy learned offline. However, naively performing such +initialization in RL may result in dramatic performance degradation during the +online interactions in the new task. To tackle this challenge, we first analyze +the performance degradation and identify two primary root causes therein: the +mismatch of the planning policy and the mismatch of the dynamics model, due to +distribution shift. We further analyze the effects of these factors on +performance degradation during finetuning, and our findings reveal that the +choice of finetuning strategies plays a pivotal role in mitigating these +effects. We then introduce AdaWM, an Adaptive World Model based planning +method, featuring two key steps: (a) mismatch identification, which quantifies +the mismatches and informs the finetuning strategy, and (b) alignment-driven +finetuning, which selectively updates either the policy or the model as needed +using efficient low-rank updates. Extensive experiments on the challenging +CARLA driving tasks demonstrate that AdaWM significantly improves the +finetuning process, resulting in more robust and efficient performance in +autonomous driving systems. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ☆ Drone Carrier: An Integrated Unmanned Surface Vehicle for Autonomous + Inspection and Intervention in GNSS-Denied Maritime Environment + + +
+ This paper introduces an innovative drone carrier concept that is applied in
+maritime port security or offshore rescue. The system combines a heterogeneous
+team of multiple Unmanned Aerial Vehicles (UAVs) and Unmanned Surface Vehicles
+(USVs) to perform inspection and intervention tasks in GNSS-denied or
+interrupted environments. The carrier, an electric catamaran measuring 4m by
+7m, features a 4m by 6m deck supporting automated takeoff and landing for four
+DJI M300 drones, along with a 10kg-payload manipulator operable in up to
+level-3 sea conditions. Utilizing an offshore gimbal camera for navigation, the
+carrier can autonomously navigate, approach and dock with non-cooperative
+vessels, guided by an onboard camera, LiDAR, and Doppler Velocity Log (DVL)
+over a 3 km$^2$ area. UAVs equipped with onboard Ultra-Wideband (UWB)
+technology execute mapping, detection, and manipulation tasks using a
+versatile gripper designed for wet, saline conditions. Additionally, two UAVs
+can coordinate to transport large objects to the manipulator or interact
+directly with them. These procedures are fully automated and were successfully
+demonstrated at the Mohammed Bin Zayed International Robotic Competition
+(MBZIRC2024), where the drone carrier, equipped with four UAVs and one
+manipulator, automatically accomplished the intervention tasks in level-3 sea
+conditions (wave height 1.25 m) based on rough target information.
+
+
+ comment: 15 pages, 12pages +
+
+
+
+
+ + ☆ PSGSL: A Probabilistic Framework Integrating Semantic Scene + Understanding and Gas Sensing for Gas Source Localization + + +
+ Semantic scene understanding allows a robotic agent to reason about problems +in complex ways, using information from multiple and varied sensors to make +deductions about a particular matter. As a result, this form of intelligent +robotics is capable of performing more complex tasks and achieving more precise +results than simpler approaches based on single data sources. However, these +improved capabilities come at the cost of higher complexity, both computational +and in terms of design. Due to the increased design complexity, formal +approaches for exploiting semantic understanding become necessary. + We present here a probabilistic formulation for integrating semantic +knowledge into the process of gas source localization (GSL). The problem of GSL +poses many unsolved challenges, and proposed solutions need to contend with the +constraining limitations of sensing hardware. By exploiting semantic scene +understanding, we can leverage other sources of information, such as vision, to +improve the estimation of the source location. We show how our formulation can +be applied to pre-existing GSL algorithms and the effect that including +semantic data has on the produced estimations of the location of the source. + +
+
+
+
+
+ + ☆ Int2Planner: An Intention-based Multi-modal Motion Planner for + Integrated Prediction and Planning + + +
+ Motion planning is a critical module in autonomous driving, with the primary
+challenge of uncertainty caused by interactions with other participants. As
+most previous methods treat prediction and planning as separate tasks, it is
+difficult to model these interactions. Furthermore, since the route path
+navigates ego vehicles to a predefined destination, it provides relatively
+stable intentions for ego vehicles and helps constrain uncertainty. On this
+basis, we construct Int2Planner, an \textbf{Int}ention-based
+\textbf{Int}egrated motion \textbf{Planner} that achieves multi-modal planning
+and prediction. Instead of static intention points, Int2Planner utilizes route
+intention points for ego vehicles and generates corresponding planning
+trajectories for each intention point to facilitate multi-modal planning. The
+experiments on a private dataset and the public nuPlan benchmark show the
+effectiveness of route intention points, and Int2Planner achieves
+state-of-the-art performance. We also deploy it on real-world vehicles and
+have conducted autonomous driving for hundreds of kilometers in urban areas.
+This further verifies that Int2Planner can continuously interact with the
+traffic environment. Code will be available at
+https://github.com/cxlz/Int2Planner.
+
+
+
+
+
+ + ☆ Grid-based Submap Joining: An Efficient Algorithm for Simultaneously + Optimizing Global Occupancy Map and Local Submap Frames IROS 2024 + + +
+ Optimizing robot poses and the map simultaneously has been shown to provide
+more accurate SLAM results. However, for non-feature based SLAM approaches,
+directly optimizing all the robot poses and the whole map will greatly increase
+the computational cost, making SLAM problems difficult to solve in large-scale
+environments. To solve the 2D non-feature based SLAM problem in large-scale
+environments more accurately and efficiently, we propose the grid-based submap
+joining method. Specifically, we first formulate the 2D grid-based submap
+joining problem in a non-linear least squares (NLLS) form to optimize the
+global occupancy map and local submap frames simultaneously. We then prove
+that, when solving the NLLS problem using the Gauss-Newton (GN) method, the
+increments of the poses in each iteration are independent of the occupancy
+values of the global occupancy map. Based on this property, we propose a
+pose-only GN algorithm equivalent to the full GN method to solve the NLLS
+problem. The proposed submap joining algorithm is very efficient due to this
+independence property and the pose-only solution. Evaluations using
+simulations and publicly available practical 2D laser datasets confirm that
+our proposed method outperforms state-of-the-art methods in terms of
+efficiency and accuracy, as well as its ability to solve the grid-based SLAM
+problem in very large-scale environments.
+
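+ For context, a generic Gauss-Newton loop for a nonlinear least-squares
+problem $\min_x \|r(x)\|^2$ looks as follows (a plain Python/NumPy sketch; the
+paper's pose-only solver additionally exploits the structure that makes the
+pose increments independent of the occupancy values):
+
+    import numpy as np
+
+    def gauss_newton(residual, jacobian, x0, iters=20, tol=1e-9):
+        x = np.asarray(x0, dtype=float)
+        for _ in range(iters):
+            r = residual(x)                              # stacked residual vector r(x)
+            J = jacobian(x)                              # Jacobian of r at x
+            dx = np.linalg.solve(J.T @ J, -J.T @ r)      # normal equations
+            x = x + dx
+            if np.linalg.norm(dx) < tol:
+                break
+        return x
+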
+
+ comment: Accepted by IROS 2024 +
+
+
+
+
+ + ☆ AnyNav: Visual Neuro-Symbolic Friction Learning for Off-road Navigation + + +
+ Off-road navigation is essential for a wide range of applications in field +robotics such as planetary exploration and disaster response. However, it +remains an unresolved challenge due to the unstructured environments and +inherent complexity of terrain-vehicle interactions. Traditional physics-based +methods struggle to accurately model the nonlinear dynamics of these +interactions, while data-driven approaches often suffer from overfitting to +specific motion patterns, vehicle sizes, and types, limiting their +generalizability. To overcome these challenges, we introduce a vision-based +friction estimation framework grounded in neuro-symbolic principles, +integrating neural networks for visual perception with symbolic reasoning for +physical modeling. This enables significantly improved generalization abilities +through explicit physical reasoning incorporating the predicted friction. +Additionally, we develop a physics-informed planner that leverages the learned +friction coefficient to generate physically feasible and efficient paths, along +with corresponding speed profiles. We refer to our approach as AnyNav and +evaluate it in both simulation and real-world experiments, demonstrating its +utility and robustness across various off-road scenarios and multiple types of +four-wheeled vehicles. These results mark an important step toward developing +neuro-symbolic spatial intelligence to reason about complex, unstructured +environments and enable autonomous off-road navigation in challenging +scenarios. Video demonstrations are available at https://sairlab.org/anynav/, +where the source code will also be released. + +
+
+
+
+
+ + ☆ A 3-Step Optimization Framework with Hybrid Models for a Humanoid + Robot's Jump Motion + + +
+ High-dynamic jump motions are challenging tasks for humanoid robots aiming to
+achieve environment adaptation and obstacle crossing. Trajectory optimization
+is a practical method to achieve such high-dynamic and explosive jumping. This
+paper proposes a 3-step trajectory optimization framework for generating a
+jump motion for a humanoid robot. To improve iteration speed and achieve ideal
+performance, the framework comprises three sub-optimizations. The first
+optimization incorporates momentum, inertia, and center of pressure (CoP),
+treating the robot as a static reaction momentum pendulum (SRMP) model to
+generate corresponding trajectories. The second optimization maps these
+trajectories to joint space using effective Quadratic Programming (QP)
+solvers. Finally, the third optimization generates whole-body joint
+trajectories utilizing the trajectories generated by the previous parts. With
+the combined consideration of momentum and inertia, the robot achieves agile
+forward jump motions. A simulation and experiments of a forward jump with a
+distance of 1.0 m and a height of 0.5 m are presented in this paper,
+validating the applicability of the proposed framework.
+
+
+
+
+
+ + ☆ "See You Later, Alligator": Impacts of Robot Small Talk on Task, + Rapport, and Interaction Dynamics in Human-Robot Collaboration + + +
+ Small talk can foster rapport building in human-human teamwork; yet how +non-anthropomorphic robots, such as collaborative manipulators commonly used in +industry, may capitalize on these social communications remains unclear. This +work investigates how robot-initiated small talk influences task performance, +rapport, and interaction dynamics in human-robot collaboration. We developed an +autonomous robot system that assists a human in an assembly task while +initiating and engaging in small talk. A user study ($N = 58$) was conducted in +which participants worked with either a functional robot, which engaged in only +task-oriented speech, or a social robot, which also initiated small talk. Our +study found that participants in the social condition reported significantly +higher levels of rapport with the robot. Moreover, all participants in the +social condition responded to the robot's small talk attempts; 59% initiated +questions to the robot, and 73% engaged in lingering conversations after +requesting the final task item. Although active working times were similar +across conditions, participants in the social condition recorded longer task +durations than those in the functional condition. We discuss the design and +implications of robot small talk in shaping human-robot collaboration. + +
+
+ comment: 8 pages, 4 figures, preprint for HRI25, the 20th edition of the + IEEE/ACM International Conference on Human-Robot Interaction +
+
+
+
+
+ + ☆ Safe and Efficient Robot Action Planning in the Presence of Unconcerned + Humans + + +
+ This paper proposes a robot action planning scheme that provides an efficient +and probabilistically safe plan for a robot interacting with an unconcerned +human -- someone who is either unaware of the robot's presence or unwilling to +engage in ensuring safety. The proposed scheme is predictive, meaning that the +robot is required to predict human actions over a finite future horizon; such +predictions are often inaccurate in real-world scenarios. One possible approach +to reduce the uncertainties is to provide the robot with the capability of +reasoning about the human's awareness of potential dangers. This paper +discusses that by using a binary variable, so-called danger awareness +coefficient, it is possible to differentiate between concerned and unconcerned +humans, and provides a learning algorithm to determine this coefficient by +observing human actions. Moreover, this paper argues how humans rely on +predictions of other agents' future actions (including those of robots in +human-robot interaction) in their decision-making. It also shows that ignoring +this aspect in predicting human's future actions can significantly degrade the +efficiency of the interaction, causing agents to deviate from their optimal +paths. The proposed robot action planning scheme is verified and validated via +extensive simulation and experimental studies on a LoCoBot WidowX-250. + +
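+ The binary danger-awareness coefficient can be maintained as a belief and
+updated from observed human actions; the snippet below is a simple
+illustrative Bayes update in Python (the paper's learning algorithm and
+likelihood models are more elaborate, and the function names are
+placeholders):
+
+    def update_awareness_belief(prior, action, lik_aware, lik_unaware):
+        """Posterior probability that the human is danger-aware, after one observed action."""
+        p_aware = lik_aware(action) * prior              # likelihood under an aware human
+        p_unaware = lik_unaware(action) * (1.0 - prior)  # likelihood under an unconcerned human
+        return p_aware / (p_aware + p_unaware)
+
+For instance, the two likelihoods could be Gaussians centred on the actions
+predicted for a concerned and an unconcerned human, respectively.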
+
+
+
+
+ + ☆ Polyhedral Collision Detection via Vertex Enumeration + + +
+ Collision detection is a critical functionality for robotics. The degree to +which objects collide cannot be represented as a continuously differentiable +function for any shapes other than spheres. This paper proposes a framework for +handling collision detection between polyhedral shapes. We frame the signed +distance between two polyhedral bodies as the optimal value of a convex +optimization, and consider constraining the signed distance in a bilevel +optimization problem. To avoid relying on specialized bilevel solvers, our +method exploits the fact that the signed distance is the minimal point of a +convex region related to the two bodies. Our method enumerates the values +obtained at all extreme points of this region and lists them as constraints in +the higher-level problem. We compare our formulation to existing methods in +terms of reliability and speed when solved using the same mixed complementarity +problem solver. We demonstrate that our approach more reliably solves difficult +collision detection problems with multiple obstacles than other methods, and is +faster than existing methods in some cases. + +
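+ The distance between two convex polyhedra can indeed be posed as a small
+convex program over convex-combination weights of their vertices; the SciPy
+sketch below illustrates that formulation (unsigned distance only, with a
+generic solver rather than the paper's vertex-enumeration and bilevel
+machinery):
+
+    import numpy as np
+    from scipy.optimize import minimize
+
+    def polytope_distance(V1, V2):
+        """Minimal distance between conv(V1) and conv(V2); V1, V2 are (n_i, d) vertex arrays."""
+        n1, n2 = len(V1), len(V2)
+        def objective(w):
+            p = w[:n1] @ V1                              # point in the first polytope
+            q = w[n1:] @ V2                              # point in the second polytope
+            return np.sum((p - q) ** 2)
+        cons = [{"type": "eq", "fun": lambda w: np.sum(w[:n1]) - 1.0},
+                {"type": "eq", "fun": lambda w: np.sum(w[n1:]) - 1.0}]
+        w0 = np.concatenate([np.ones(n1) / n1, np.ones(n2) / n2])
+        res = minimize(objective, w0, bounds=[(0, 1)] * (n1 + n2),
+                       constraints=cons, method="SLSQP")
+        return float(np.sqrt(res.fun))
+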
+
+
+
+
+ + ☆ Map Prediction and Generative Entropy for Multi-Agent Exploration + + +
+ Traditionally, autonomous reconnaissance applications have acted on explicit +sets of historical observations. Aided by recent breakthroughs in generative +technologies, this work enables robot teams to act beyond what is currently +known about the environment by inferring a distribution of reasonable +interpretations of the scene. We developed a map predictor that inpaints the +unknown space in a multi-agent 2D occupancy map during an exploration mission. +From a comparison of several inpainting methods, we found that a fine-tuned +latent diffusion inpainting model could provide rich and coherent +interpretations of simulated urban environments with relatively little +computation time. By iteratively inferring interpretations of the scene +throughout an exploration run, we are able to identify areas that exhibit high +uncertainty in the prediction, which we formalize with the concept of +generative entropy. We prioritize tasks in regions of high generative entropy, +hypothesizing that this will expedite convergence on an accurate predicted map +of the scene. In our study we juxtapose this new paradigm of task ranking with +the state of the art, which ranks regions to explore by those which maximize +expected information recovery. We compare both of these methods in a simulated +urban environment with three vehicles. Our results demonstrate that by using +our new task ranking method, we can predict a correct scene significantly +faster than with a traditional information-guided method. + +
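+ One plausible way to formalize the per-cell generative entropy of the
+predicted map is the binary entropy of the mean occupancy across sampled
+inpaintings, which is high exactly where the samples disagree. This is a
+hedged NumPy sketch, not necessarily the authors' exact definition:
+
+    import numpy as np
+
+    def generative_entropy(sampled_maps, eps=1e-9):
+        """sampled_maps: (N, H, W) occupancy probabilities from N sampled inpaintings."""
+        p = np.clip(sampled_maps.mean(axis=0), eps, 1.0 - eps)      # mean occupancy per cell
+        return -(p * np.log(p) + (1.0 - p) * np.log(1.0 - p))       # (H, W) entropy map
+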
+
+
+
+
+ + ☆ A Hierarchical Reinforcement Learning Framework for Multi-UAV Combat + Using Leader-Follower Strategy + + +
+ Multi-UAV air combat is a complex task involving multiple autonomous UAVs and
+an evolving field in both aerospace and artificial intelligence. This paper
+aims to enhance adversarial performance through collaborative strategies.
+Previous approaches predominantly discretize the action space into predefined
+actions, limiting UAV maneuverability and complex strategy implementation.
+Others simplify the problem to 1v1 combat, neglecting the cooperative dynamics
+among multiple UAVs. To address the high-dimensional challenges inherent in
+six-degree-of-freedom space and improve cooperation, we propose a hierarchical
+framework utilizing the Leader-Follower Multi-Agent Proximal Policy
+Optimization (LFMAPPO) strategy. Specifically, the framework is structured
+into three levels. The top level conducts a macro-level assessment of the
+environment and guides the execution policy. The middle level determines the
+angle of the desired action. The bottom level generates precise action
+commands for the high-dimensional action space. Moreover, we optimize the
+state-value functions by assigning distinct roles with the leader-follower
+strategy to train the top-level policy; the followers estimate the leader's
+utility, promoting effective cooperation among agents. Additionally, the
+incorporation of a target selector, aligned with the UAVs' posture, assesses
+the threat level of targets. Finally, simulation experiments validate the
+effectiveness of our proposed method.
+
+
+
+
+
+ + ☆ A Spatio-temporal Graph Network Allowing Incomplete Trajectory Input for + Pedestrian Trajectory Prediction + + +
+ Pedestrian trajectory prediction is important for research on mobile robot
+navigation in environments with pedestrians. Most pedestrian trajectory
+prediction algorithms require the input historical trajectories to be
+complete. If a pedestrian is unobservable in any past frame, its historical
+trajectory becomes incomplete and the algorithm will not predict its future
+trajectory. To address this limitation, we propose STGN-IT, a spatio-temporal
+graph network allowing incomplete trajectory input, which can predict the
+future trajectories of pedestrians with incomplete historical trajectories.
+STGN-IT uses a spatio-temporal graph with an additional encoding method to
+represent the historical trajectories and observation states of pedestrians.
+Moreover, STGN-IT introduces static obstacles in the environment that may
+affect the future trajectories as additional nodes to further improve the
+prediction accuracy. A clustering algorithm is also applied in the
+construction of the spatio-temporal graphs. Experiments on public datasets
+show that STGN-IT outperforms state-of-the-art algorithms.
+
+
+
+
+
+ + ♻ ☆ Fast Ergodic Search with Kernel Functions + + +
+ Ergodic search enables optimal exploration of an information distribution +while guaranteeing the asymptotic coverage of the search space. However, +current methods typically have exponential computation complexity in the search +space dimension and are restricted to Euclidean space. We introduce a +computationally efficient ergodic search method. Our contributions are +two-fold. First, we develop a kernel-based ergodic metric and generalize it +from Euclidean space to Lie groups. We formally prove the proposed metric is +consistent with the standard ergodic metric while guaranteeing linear +complexity in the search space dimension. Secondly, we derive the first-order +optimality condition of the kernel ergodic metric for nonlinear systems, which +enables efficient trajectory optimization. Comprehensive numerical benchmarks +show that the proposed method is at least two orders of magnitude faster than +the state-of-the-art algorithm. Finally, we demonstrate the proposed algorithm +with a peg-in-hole insertion task. We formulate the problem as a coverage task +in the space of SE(3) and use a 30-second-long human demonstration as the prior +distribution for ergodic coverage. Ergodicity guarantees the asymptotic +solution of the peg-in-hole problem so long as the solution resides within the +prior information distribution, which is seen in the 100% success rate. + +
+
+ comment: Accepted to IEEE Transactions on Robotics (T-RO). 20 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ A Data-driven Contact Estimation Method for Wheeled-Biped Robots + + +
+ Contact estimation is a key ability for limbed robots, where making and
+breaking contacts has a direct impact on state estimation and balance control.
+Existing approaches typically rely on gait-cycle priors or designated contact
+sensors. We design a contact estimator that is suitable for the emerging
+wheeled-biped robot types that do not have these features. To this end, we
+propose a Bayes filter in which update steps are learned from real-robot
+torque measurements while prediction steps rely on inertial measurements. We
+evaluate this approach in extensive real-robot and simulation experiments. Our
+method achieves better performance while being considerably more
+sample-efficient than a comparable deep-learning baseline.
+
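+ A two-state Bayes filter of the kind described above can be sketched in a few
+lines of Python; here the transition matrix stands in for the inertial
+prediction model and the likelihood function for the learned torque update,
+and all names are placeholders:
+
+    def contact_bayes_filter(belief, transition, torque_likelihood, torque):
+        """belief: [P(no contact), P(contact)]; transition[i][j] = P(next=j | current=i)."""
+        predicted = [transition[0][0] * belief[0] + transition[1][0] * belief[1],
+                     transition[0][1] * belief[0] + transition[1][1] * belief[1]]
+        weighted = [torque_likelihood(torque, s) * predicted[s] for s in (0, 1)]
+        norm = sum(weighted)
+        return [w / norm for w in weighted]
+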
+
+
+
+
+ + ♻ ☆ SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and + Chain-of-Thought for Embodied Task Planning + + +
+ Spatial reasoning is an essential problem in embodied AI research. Efforts to +enhance spatial reasoning abilities through supplementary spatial data and +fine-tuning have proven limited and ineffective when addressing complex +embodied tasks, largely due to their dependence on language-based outputs. +While some approaches have introduced a point-based action space to mitigate +this issue, they fall short in managing more intricate tasks within complex +environments. This deficiency arises from their failure to fully exploit the +inherent thinking and reasoning capabilities that are fundamental strengths of +Vision-Language Models (VLMs). To address these limitations, we propose a novel +approach named SpatialCoT, specifically designed to bolster the spatial +reasoning capabilities of VLMs. Our approach comprises two stages: spatial +coordinate bi-directional alignment, which aligns vision-language inputs with +spatial coordinates, and chain-of-thought spatial grounding, which harnesses +the reasoning capabilities of language models for advanced spatial reasoning. +We evaluate SpatialCoT on challenging navigation and manipulation tasks, both +in simulation and real-world settings. Experimental results demonstrate that +our method significantly outperforms previous state-of-the-art approaches in +both tasks. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Design Optimizer for Soft Growing Robot Manipulators in + Three-Dimensional Environments + + +
+ Soft growing robots are novel devices that mimic plant-like growth for
+navigation in cluttered or dangerous environments. Their ability to adapt to
+surroundings, combined with advancements in actuation and manufacturing
+technologies, allows them to perform specialized manipulation tasks. This work
+presents an approach for the design optimization of soft growing robots;
+specifically, the three-dimensional extension of the optimizer designed for
+planar manipulators. This tool is intended to be used by engineers and robot
+enthusiasts before manufacturing their robot: it suggests the optimal size of
+the robot for solving a specific task. The design process is modeled as a
+multi-objective optimization problem that refines a soft manipulator's
+kinematic chain. Thanks to the novel Rank Partitioning algorithm integrated
+into Evolutionary Computation (EC) algorithms, this method achieves high
+precision in reaching targets and is efficient in resource usage. Results show
+high performance in solving three-dimensional tasks, and comparative
+experiments indicate that the optimizer produces robust output when tested with
+different EC algorithms, particularly genetic algorithms. + 
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ TelePreview: A User-Friendly Teleoperation System with Virtual Arm + Assistance for Enhanced Effectiveness + + +
+ Teleoperation provides an effective way to collect robot data, which is +crucial for learning from demonstrations. In this field, teleoperation faces +several key challenges: user-friendliness for new users, safety assurance, and +transferability across different platforms. While collecting real robot +dexterous manipulation data by teleoperation to train robots has shown +impressive results on diverse tasks, due to the morphological differences +between human and robot hands, it is not only hard for new users to understand +the action mapping but also raises potential safety concerns during operation. +To address these limitations, we introduce TelePreview. This teleoperation +system offers real-time visual feedback on robot actions based on human user +inputs, with a total hardware cost of less than $1,000. TelePreview allows the +user to see a virtual robot that represents the outcome of the user's next +movement. By enabling flexible switching between command visualization and +actual execution, this system helps new users learn how to demonstrate quickly +and safely. We demonstrate that it outperforms other teleoperation systems +across five tasks, emphasize its ease of use, and highlight its straightforward +deployment across diverse robotic platforms. We release our code and a +deployment document on our website https://telepreview.github.io. + +
+
+ comment: In submission +
+
+
+
+
+ + ♻ SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon + Visuomotor Learning + + +
+ We present a low-cost legged mobile manipulation system that solves
+long-horizon real-world tasks, trained by reinforcement learning purely in
+simulation. This system is made possible by 1) a hierarchical design of a
+high-level policy for visual-mobile manipulation following instructions and a
+low-level policy for quadruped movement and limb control, 2) a progressive
+exploration and learning approach that leverages privileged task decomposition
+information to train the teacher policy for long-horizon tasks, which will
+guide an imitation-based student policy for efficient training of the
+high-level visuomotor policy, and 3) a suite of techniques for minimizing
+sim-to-real gaps.
+ In contrast to previous approaches that use high-end equipment, our system
+demonstrates effective performance with more accessible hardware -
+specifically, a Unitree Go1 quadruped, a WidowX250S arm, and a single
+wrist-mounted RGB camera - despite the increased challenges of sim-to-real
+transfer. When fully trained in simulation, a single policy autonomously solves
+long-horizon tasks such as search, move, grasp, and drop-into, achieving nearly
+80% success. This performance is comparable to that of expert human
+teleoperation on the same tasks but is more efficient, running at 1.5 times the
+speed of a human expert. The sim-to-real transfer is fluid across diverse
+indoor and outdoor scenes under varying lighting conditions. Finally, we
+discuss the key techniques that enable the entire pipeline, including efficient
+RL training and sim-to-real transfer, to work effectively for legged mobile
+manipulation, and present their ablation results. + 
+
+
+
+
+ + ♻ ☆ Offline-to-online Reinforcement Learning for Image-based Grasping with + Scarce Demonstrations + + +
+ Offline-to-online reinforcement learning (O2O RL) aims to obtain a
+continually improving policy as it interacts with the environment, while
+ensuring the initial policy behaviour is satisficing. This satisficing
+behaviour is necessary for robotic manipulation where random exploration can be
+costly due to catastrophic failures and time. O2O RL is especially compelling
+when we can only obtain a scarce amount of (potentially suboptimal)
+demonstrations, a scenario where behavioural cloning (BC) is known to suffer
+from distribution shift. Previous works have outlined the challenges in
+applying O2O RL algorithms in image-based environments. In this work, we
+propose a novel O2O RL algorithm that can learn in a real-life image-based
+robotic vacuum grasping task with a small number of demonstrations where BC
+fails the majority of the time. The proposed algorithm replaces the target
+network in off-policy actor-critic algorithms with a regularization technique
+inspired by the neural tangent kernel. We demonstrate that the proposed
+algorithm can reach a success rate above 90% in under two hours of interaction
+time, with only 50 human demonstrations, while BC and existing commonly-used RL
+algorithms fail to achieve similar performance. + 
+
+ comment: In CoRL Workshop on Mastering Robot Manipulation in a World of + Abundant Data 2024 +
+
+
+
+
+ + ♻ ☆ From Novice to Skilled: RL-based Shared Autonomy Communicating with + Pilots in UAV Multi-Task Missions + + +
+ Multi-task missions for unmanned aerial vehicles (UAVs) involving inspection
+and landing tasks are challenging for novice pilots due to the difficulties
+associated with depth perception and the control interface. We propose a shared
+autonomy system, alongside supplementary information displays, to assist pilots
+in successfully completing multi-task missions without any pilot training. Our
+approach comprises three modules: (1) a perception module that encodes visual
+information onto a latent representation, (2) a policy module that augments the
+pilot's actions, and (3) an information augmentation module that provides
+additional information to the pilot. The policy module is trained in simulation
+with simulated users and transferred to the real world without modification in
+a user study (n=29), alongside alternative supplementary information schemes
+including learnt red/green light feedback cues and an augmented reality
+display. The pilot's intent is unknown to the policy module and is inferred
+from the pilot's input and the UAV's states. The assistant increased the task
+success rates for the landing and inspection tasks from 16.67% and 54.29% to
+95.59% and 96.22%, respectively. With the assistant, inexperienced pilots
+achieved similar performance to experienced pilots. Red/green light feedback
+cues reduced the required time by 19.53% and trajectory length by 17.86% for
+the inspection task, and participants rated them as their preferred condition
+due to the intuitive interface and the reassurance they provided. This work
+demonstrates that simple user models can train shared autonomy systems in
+simulation, and transfer to physical tasks to estimate user intent and provide
+effective assistance and information to the pilot. + 
+
+ comment: 37 pages, 11 figures, 6 tables. Accepted to ACM Transactions on + Human-Robot Interaction (THRI) +
+
+
+
+
+ + ♻ ☆ A 65 nm Bayesian Neural Network Accelerator with 360 fJ/Sample In-Word + GRNG for AI Uncertainty Estimation + + +
+ Uncertainty estimation is an indispensable capability for AI-enabled, +safety-critical applications, e.g. autonomous vehicles or medical diagnosis. +Bayesian neural networks (BNNs) use Bayesian statistics to provide both +classification predictions and uncertainty estimation, but they suffer from +high computational overhead associated with random number generation and +repeated sample iterations. Furthermore, BNNs are not immediately amenable to +acceleration through compute-in-memory architectures due to the frequent memory +writes necessary after each RNG operation. To address these challenges, we +present an ASIC that integrates 360 fJ/Sample Gaussian RNG directly into the +SRAM memory words. This integration reduces RNG overhead and enables +fully-parallel compute-in-memory operations for BNNs. The prototype chip +achieves 5.12 GSa/s RNG throughput and 102 GOp/s neural network throughput +while occupying 0.45 mm2, bringing AI uncertainty estimation to edge +computation. + +
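+ To make the sampling cost concrete, the toy below runs Monte Carlo forward passes through a single
+Gaussian-weight layer; each pass draws fresh weights w ~ N(mu, sigma^2), which is the kind of
+per-weight sample an in-word Gaussian RNG would supply in hardware. The shapes, sample count, and
+variance-as-uncertainty readout are illustrative assumptions.
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def bayesian_linear_forward(x, w_mu, w_sigma, n_samples=32):
+    """Monte Carlo passes through a Gaussian-weight linear layer; the spread
+    of the outputs serves as a simple uncertainty estimate (sketch only)."""
+    outs = []
+    for _ in range(n_samples):
+        w = w_mu + w_sigma * rng.standard_normal(w_mu.shape)   # one Gaussian draw per weight
+        outs.append(x @ w)
+    outs = np.stack(outs)               # (n_samples, batch, out)
+    return outs.mean(0), outs.var(0)    # prediction and predictive variance
+
+x = rng.standard_normal((4, 16))
+w_mu = 0.1 * rng.standard_normal((16, 3))
+w_sigma = np.full((16, 3), 0.05)
+mean, var = bayesian_linear_forward(x, w_mu, w_sigma)
+print(mean.shape, var.shape)            # (4, 3) (4, 3)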
+
+ comment: 7 pages, 12 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 97 + +
+
+
+ + ☆ Accelerate High-Quality Diffusion Models with Inner Loop Feedback + + +
+ We propose Inner Loop Feedback (ILF), a novel approach to accelerate
+diffusion models' inference. ILF trains a lightweight module to predict future
+features in the denoising process by leveraging the outputs from a chosen
+diffusion backbone block at a given time step. This approach exploits two key
+intuitions: (1) the outputs of a given block at adjacent time steps are
+similar, and (2) performing partial computations for a step imposes a lower
+burden on the model than skipping the step entirely. Our method is highly
+flexible, since we find that the feedback module itself can simply be a block
+from the diffusion backbone, with all settings copied. Its influence on the
+diffusion forward pass can be tempered with a learnable scaling factor from
+zero initialization. We train this module using distillation losses; however,
+unlike some prior work where a full diffusion backbone serves as the student,
+our model freezes the backbone, training only the feedback module. While many
+efforts to optimize diffusion models focus on achieving acceptable image
+quality in extremely few steps (1-4 steps), our emphasis is on matching
+best-case results (typically achieved in 20 steps) while significantly reducing
+runtime. ILF achieves this balance effectively, demonstrating strong
+performance for both class-to-image generation with a diffusion transformer
+(DiT) and text-to-image generation with the DiT-based PixArt-alpha and
+PixArt-sigma. The quality of ILF's 1.7x-1.8x speedups is confirmed by FID, CLIP
+score, CLIP Image Quality Assessment, ImageReward, and qualitative comparisons. + 
+
+ comment: submission currently under review; 20 pages, 17 figures, 6 tables +
+
+
+
+
+ + ☆ VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video + Understanding + + +
+ In this paper, we propose VideoLLaMA3, a more advanced multimodal foundation
+model for image and video understanding. The core design philosophy of
+VideoLLaMA3 is vision-centric. The meaning of "vision-centric" is two-fold: the
+vision-centric training paradigm and the vision-centric framework design. The
+key insight of our vision-centric training paradigm is that high-quality
+image-text data is crucial for both image and video understanding. Instead of
+preparing massive video-text datasets, we focus on constructing large-scale and
+high-quality image-text datasets. VideoLLaMA3 has four training stages: 1) the
+vision-centric alignment stage, which warms up the vision encoder and
+projector; 2) the vision-language pretraining stage, which jointly tunes the
+vision encoder, projector, and LLM with large-scale image-text data covering
+multiple types (including scene images, documents, and charts) as well as
+text-only data; 3) the multi-task fine-tuning stage, which incorporates
+image-text SFT data for downstream tasks and video-text data to establish a
+foundation for video understanding; and 4) the video-centric fine-tuning stage,
+which further improves the model's capability in video understanding. As for
+the framework design, to better capture fine-grained details in images, the
+pretrained vision encoder is adapted to encode images of varying sizes into
+correspondingly many vision tokens, rather than a fixed number of tokens. For
+video inputs, we reduce the number of vision tokens according to their
+similarity so that the representation of videos will be more precise and
+compact. Benefiting from these vision-centric designs, VideoLLaMA3 achieves
+compelling performance on both image and video understanding benchmarks. + 
+
+ comment: BZ, KL, ZC, ZH, YY, GC, SL, YJ, HZ, and XL contributed equally to + this project. Code: https://github.com/DAMO-NLP-SG/VideoLLaMA3 +
+
+
+
+
+ + ☆ Neural Radiance Fields for the Real World: A Survey + + +
+ Neural Radiance Fields (NeRFs) have remodeled 3D scene representation since
+their release. NeRFs can effectively reconstruct complex 3D scenes from 2D
+images, advancing different fields and applications such as scene
+understanding, 3D content generation, and robotics. Despite significant
+research progress, a thorough review of recent innovations, applications, and
+challenges is lacking. This survey compiles key theoretical advancements and
+alternative representations and investigates emerging challenges. It further
+explores applications in reconstruction, highlights NeRFs' impact on computer
+vision and robotics, and reviews essential datasets and toolkits. By
+identifying gaps in the literature, this survey discusses open challenges and
+offers directions for future research. + 
+
+
+
+
+ + ☆ Robust Representation Consistency Model via Contrastive Denoising + + +
+ Robustness is essential for deep neural networks, especially in
+security-sensitive applications. To this end, randomized smoothing provides
+theoretical guarantees for certifying robustness against adversarial
+perturbations. Recently, diffusion models have been successfully employed for
+randomized smoothing to purify noise-perturbed samples before making
+predictions with a standard classifier. While these methods excel at small
+perturbation radii, they struggle with larger perturbations and incur a
+significant computational overhead during inference compared to classical
+methods. To address this, we reformulate the generative modeling task along the
+diffusion trajectories in pixel space as a discriminative task in the latent
+space. Specifically, we use instance discrimination to achieve consistent
+representations along the trajectories by aligning temporally adjacent points.
+After fine-tuning based on the learned representations, our model enables
+implicit denoising-then-classification via a single prediction, substantially
+reducing inference costs. We conduct extensive experiments on various datasets
+and achieve state-of-the-art performance with minimal computation budget during
+inference. For example, our method outperforms the certified accuracy of
+diffusion-based methods on ImageNet across all perturbation radii by 5.3% on
+average, with up to 11.6% at larger radii, while reducing inference costs by
+85x on average. Codes are available at: https://github.com/jiachenlei/rRCM. + 
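+ For background, the standard randomized-smoothing certificate (in the style of Cohen et al.)
+predicts the majority class under Gaussian input noise and converts the top-class frequency into an
+L2 radius via the Gaussian quantile function. The sketch below uses a toy classifier and omits the
+confidence-bound bookkeeping that a real certification procedure requires.
+import numpy as np
+from scipy.stats import norm
+
+def smoothed_prediction_and_radius(classifier, x, sigma=0.5, n=1000, seed=0):
+    """Monte Carlo estimate of the smoothed classifier's top class and a
+    (simplified) certified L2 radius sigma * Phi^{-1}(p_top)."""
+    rng = np.random.default_rng(seed)
+    noisy = x[None, :] + sigma * rng.standard_normal((n, x.size))
+    preds = np.array([classifier(z) for z in noisy])
+    top = np.bincount(preds).argmax()
+    p_top = (preds == top).mean()
+    radius = sigma * norm.ppf(p_top) if p_top > 0.5 else 0.0
+    return top, radius
+
+clf = lambda z: int(z[0] > 0)   # toy base classifier: sign of the first coordinate
+label, radius = smoothed_prediction_and_radius(clf, np.array([1.0, 0.0]))
+print(label, round(radius, 3))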
+
+
+
+
+ + ☆ Orchid: Image Latent Diffusion for Joint Appearance and Geometry + Generation + + +
+ Diffusion models are state-of-the-art for image generation. Trained on large +datasets, they capture expressive image priors that have been used for tasks +like inpainting, depth, and (surface) normal prediction. However, these models +are typically trained for one specific task, e.g., a separate model for each of +color, depth, and normal prediction. Such models do not leverage the intrinsic +correlation between appearance and geometry, often leading to inconsistent +predictions. + In this paper, we propose using a novel image diffusion prior that jointly +encodes appearance and geometry. We introduce a diffusion model Orchid, +comprising a Variational Autoencoder (VAE) to encode color, depth, and surface +normals to a latent space, and a Latent Diffusion Model (LDM) for generating +these joint latents. Orchid directly generates photo-realistic color images, +relative depth, and surface normals from user-provided text, and can be used to +create image-aligned partial 3D scenes seamlessly. It can also perform +image-conditioned tasks like joint monocular depth and normal prediction and is +competitive in accuracy to state-of-the-art methods designed for those tasks +alone. Lastly, our model learns a joint prior that can be used zero-shot as a +regularizer for many inverse problems that entangle appearance and geometry. +For example, we demonstrate its effectiveness in color-depth-normal inpainting, +showcasing its applicability to problems in 3D generation from sparse views. + +
+
+ comment: Project webpage: https://orchid3d.github.io +
+
+
+
+
+ + ☆ CHaRNet: Conditioned Heatmap Regression for Robust Dental Landmark + Localization + + +
+ Identifying anatomical landmarks in 3D dental models is crucial for
+orthodontic treatment. Manually placing these key points is complex,
+time-consuming, and requires expert knowledge. While some machine learning
+methods have been proposed for automatic tooth landmark detection in 3D
+Intraoral Scans (IOS), research remains limited, with no fully end-to-end
+approaches that avoid teeth segmentation.
+ We propose CHaRNet (Conditioned Heatmap Regression Network), the first
+end-to-end deep learning method for tooth landmark detection in 3D IOS. Unlike
+traditional two-stage methods that segment teeth before detecting landmarks,
+CHaRNet directly detects landmarks on the input point cloud. It consists of
+four key modules: (1) a point cloud encoder, (2) a point cloud decoder with a
+heatmap regression head, (3) a teeth presence classification head, and (4) the
+innovative Conditioned Heatmap Regression (CHaR) module. The CHaR module
+refines landmark regression by leveraging teeth presence classification,
+enabling dynamic adaptation to cases with missing teeth and improving accuracy
+in complex dental models.
+ We evaluate CHaRNet using five point cloud learning algorithms to validate
+the effectiveness of the CHaR module and test it on a clinical dataset of
+1,214 annotated 3D dental models. Both the dataset and code will be publicly
+released to address the lack of open datasets in orthodontics, promote
+benchmarking, and inspire new research.
+ CHaRNet achieves a Mean Euclidean Distance Error (MEDE) of 1.28 mm and a Mean
+Success Ratio (MSR) of 82.40%, demonstrating robust performance. Notably, it
+excels in handling irregular dental geometries, such as models with missing
+teeth. This end-to-end approach streamlines orthodontic workflows, improves 3D
+IOS analysis precision, and facilitates efficient computer-assisted treatment
+planning. + 
+
+
+
+
+ + ☆ Robust Body Composition Analysis by Generating 3D CT Volumes from + Limited 2D Slices + + +
+ Body composition analysis provides valuable insights into aging, disease
+progression, and overall health conditions. Due to concerns of radiation
+exposure, two-dimensional (2D) single-slice computed tomography (CT) imaging
+has been used repeatedly for body composition analysis. However, this approach
+introduces significant spatial variability that can impact the accuracy and
+robustness of the analysis. To mitigate this issue and facilitate body
+composition analysis, this paper presents a novel method to generate 3D CT
+volumes from a limited number of 2D slices using a latent diffusion model
+(LDM). Our approach first maps 2D slices into a latent representation space
+using a variational autoencoder. An LDM is then trained to capture the 3D
+context of a stack of these latent representations. To accurately interpolate
+intermediate slices and construct a full 3D volume, we utilize body part
+regression to determine the spatial location and distance between the acquired
+slices. Experiments on both in-house and public 3D abdominal CT datasets
+demonstrate that the proposed method significantly enhances body composition
+analysis compared to traditional 2D-based analysis, reducing the error rate
+from 23.3% to 15.2%. + 
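+ One simple way to place the synthetic slices mentioned above is to interpolate between the
+body-part-regression scores of the acquired slices. Assuming an approximately linear
+score-to-position relation (an assumption made here purely for illustration), the target scores of
+the intermediate slices can be spaced evenly, as in the sketch below.
+import numpy as np
+
+def intermediate_slice_scores(score_a, score_b, slice_spacing_mm, score_per_mm):
+    """Evenly spaced body-part-regression scores for slices to be generated
+    between two acquired slices (illustrative, linear score model assumed)."""
+    gap_mm = abs(score_b - score_a) / score_per_mm
+    n_new = max(int(round(gap_mm / slice_spacing_mm)) - 1, 0)
+    return np.linspace(score_a, score_b, n_new + 2)[1:-1]
+
+# hypothetical numbers: a 20 mm gap filled at 5 mm spacing -> three new slices
+print(intermediate_slice_scores(12.0, 18.0, slice_spacing_mm=5.0, score_per_mm=0.3))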
+
+
+
+
+ + ☆ Beyond the Lungs: Extending the Field of View in Chest CT with Latent + Diffusion Models + + +
+ The interconnection between the human lungs and other organs, such as the
+liver and kidneys, is crucial for understanding the underlying risks and
+effects of lung diseases and improving patient care. However, most chest CT
+imaging in research is focused solely on the lungs due to considerations of
+cost and radiation dose. This restricted field of view (FOV) in the acquired
+images poses challenges to comprehensive analysis and hinders the ability to
+gain insights into the impact of lung diseases on other organs. To address
+this, we propose SCOPE (Spatial Coverage Optimization with Prior Encoding), a
+novel approach to capture the inter-organ relationships from CT images and
+extend the FOV of chest CT images. Our approach first trains a variational
+autoencoder (VAE) to encode 2D axial CT slices individually, then stacks the
+latent representations of the VAE to form a 3D context for training a latent
+diffusion model. Once trained, our approach extends the FOV of CT images in the
+z-direction by generating new axial slices in a zero-shot manner. We evaluated
+our approach on the National Lung Screening Trial (NLST) dataset, and results
+suggest that it effectively extends the FOV to include the liver and kidneys,
+which are not completely covered in the original NLST data acquisition.
+Quantitative results on a held-out whole-body dataset demonstrate that the
+generated slices exhibit high fidelity with acquired data, achieving an SSIM of
+0.81. + 
+
+
+
+
+ + ☆ SMART-Vision: Survey of Modern Action Recognition Techniques in Vision + + +
+ Human Action Recognition (HAR) is a challenging domain in computer vision,
+involving recognizing complex patterns by analyzing the spatiotemporal dynamics
+of individuals' movements in videos. These patterns arise in sequential data,
+such as video frames, which are often essential to accurately distinguish
+actions that would be ambiguous in a single image. HAR has garnered
+considerable interest due to its broad applicability, ranging from robotics and
+surveillance systems to sports motion analysis, healthcare, and the burgeoning
+field of autonomous vehicles. While several taxonomies have been proposed to
+categorize HAR approaches in surveys, they often overlook hybrid methodologies
+and fail to demonstrate how different models incorporate various architectures
+and modalities. In this comprehensive survey, we present the novel SMART-Vision
+taxonomy, which illustrates how innovations in deep learning for HAR complement
+one another, leading to hybrid approaches beyond traditional categories. Our
+survey provides a clear roadmap from foundational HAR works to current
+state-of-the-art systems, highlighting emerging research directions and
+addressing unresolved challenges in discussion sections for architectures
+within the HAR domain. We provide details of the research datasets that various
+approaches use to measure and compare the performance of HAR approaches. We
+also explore the rapidly emerging field of Open-HAR systems, which challenge
+HAR systems by presenting samples from unknown, novel classes at test time. + 
+
+
+
+
+ + ☆ A polynomial formula for the perspective four points problem + + +
+ We present a fast and accurate solution to the perspective n-points problem, +by way of a new approach to the n=4 case. Our solution hinges on a novel +separation of variables: given four 3D points and four corresponding 2D points +on the camera canvas, we start by finding another set of 3D points, sitting on +the rays connecting the camera to the 2D canvas points, so that the six +pair-wise distances between these 3D points are as close as possible to the six +distances between the original 3D points. This step reduces the perspective +problem to an absolute orientation problem (which has a solution via explicit +formula). To solve the first problem we set coordinates which are as +orientation-free as possible: on the 3D points side our coordinates are the +squared distances between the points. On the 2D canvas-points side our +coordinates are the dot products of the points after rotating one of them to +sit on the optical axis. We then derive the solution with the help of a +computer algebra system. + +
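+ The second stage mentioned above, absolute orientation, indeed has a well-known closed-form
+solution (Kabsch/Horn via an SVD). The sketch below shows that generic step on synthetic data; it is
+not the paper's full perspective-four-points pipeline, and the random test setup is only a sanity
+check.
+import numpy as np
+
+def absolute_orientation(P, Q):
+    """Closed-form rigid alignment: R, t minimizing ||R @ P + t - Q|| for
+    3xN point sets (Kabsch/Horn)."""
+    cp, cq = P.mean(1, keepdims=True), Q.mean(1, keepdims=True)
+    H = (P - cp) @ (Q - cq).T
+    U, _, Vt = np.linalg.svd(H)
+    D = np.diag([1.0, 1.0, np.sign(np.linalg.det(Vt.T @ U.T))])
+    R = Vt.T @ D @ U.T
+    return R, cq - R @ cp
+
+rng = np.random.default_rng(1)
+P = rng.standard_normal((3, 4))                       # four 3D points
+R_true, _ = np.linalg.qr(rng.standard_normal((3, 3)))
+if np.linalg.det(R_true) < 0:                         # keep a proper rotation
+    R_true[:, 0] *= -1
+t_true = rng.standard_normal((3, 1))
+R, t = absolute_orientation(P, R_true @ P + t_true)
+print(np.allclose(R, R_true), np.allclose(t, t_true))  # True True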
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ STMDNet: A Lightweight Directional Framework for Motion Pattern + Recognition of Tiny Targets + + +
+ Recognizing motions of tiny targets - only a few dozen pixels - in cluttered
+backgrounds remains a fundamental challenge when standard feature-based or deep
+learning methods fail under scarce visual cues. We propose STMDNet, a
+model-based computational framework to recognize motions of tiny targets at
+variable velocities under low sampling-frequency scenarios. STMDNet designs a
+novel dual-dynamics-and-correlation mechanism, harnessing ipsilateral
+excitation to integrate target cues and leakage-enhancing-type contralateral
+inhibition to suppress large-object and background motion interference.
+Moreover, we develop the first collaborative directional encoding-decoding
+strategy that determines the motion direction from only one correlation per
+spatial location, cutting computational costs to one-eighth of prior methods.
+Further, simply substituting the backbone of a strong STMD model with STMDNet
+raises AUC by 24%, yielding the enhanced STMDNet-F. Evaluations on real-world
+low sampling-frequency datasets show state-of-the-art results, surpassing the
+deep learning baseline. Across diverse speeds, STMDNet-F improves mF1 by 19%,
+16%, and 8% at 240Hz, 120Hz, and 60Hz, respectively, while STMDNet achieves 87
+FPS on a single CPU thread. These advances highlight STMDNet as a
+next-generation backbone for tiny target motion pattern recognition and
+underscore its broader potential to revitalize model-based visual approaches in
+motion detection. + 
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Sketch and Patch: Efficient 3D Gaussian Representation for Man-Made + Scenes + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising representation for
+photorealistic rendering of 3D scenes. However, its high storage requirements
+pose significant challenges for practical applications. We observe that
+Gaussians exhibit distinct roles and characteristics that are analogous to
+traditional artistic techniques -- just as artists first sketch outlines before
+filling in broader areas with color, some Gaussians capture high-frequency
+features like edges and contours, while other Gaussians represent broader,
+smoother regions, analogous to the broad brush strokes that add volume and
+depth to a painting. Based on this observation, we propose a novel hybrid
+representation that categorizes Gaussians into (i) Sketch Gaussians, which
+define scene boundaries, and (ii) Patch Gaussians, which cover smooth regions.
+Sketch Gaussians are efficiently encoded using parametric models, leveraging
+their geometric coherence, while Patch Gaussians undergo optimized pruning,
+retraining, and vector quantization to maintain volumetric consistency and
+storage efficiency. Our comprehensive evaluation across diverse indoor and
+outdoor scenes demonstrates that this structure-aware approach achieves up to
+32.62% improvement in PSNR, 19.12% in SSIM, and 45.41% in LPIPS at equivalent
+model sizes; correspondingly, for an indoor scene, our model maintains the
+visual quality with 2.3% of the original model size. + 
+
+
+
+
+ + ☆ Learning accurate rigid registration for longitudinal brain MRI from + synthetic data + + +
+ Rigid registration aims to determine the translations and rotations necessary +to align features in a pair of images. While recent machine learning methods +have become state-of-the-art for linear and deformable registration across +subjects, they have demonstrated limitations when applied to longitudinal +(within-subject) registration, where achieving precise alignment is critical. +Building on an existing framework for anatomy-aware, acquisition-agnostic +affine registration, we propose a model optimized for longitudinal, rigid brain +registration. By training the model with synthetic within-subject pairs +augmented with rigid and subtle nonlinear transforms, the model estimates more +accurate rigid transforms than previous cross-subject networks and performs +robustly on longitudinal registration pairs within and across magnetic +resonance imaging (MRI) contrasts. + +
+
+ comment: 5 pages, 4 figures, 1 table, rigid image registration, deep learning, + longitudinal analysis, neuroimaging, accepted by the IEEE International + Symposium on Biomedical Imaging +
+
+
+
+
+ + ☆ Deep Learning-Based Image Recovery and Pose Estimation for Resident + Space Objects + + +
+ As the density of spacecraft in Earth's orbit increases, their recognition,
+pose and trajectory identification become crucial for averting potential
+collisions and executing debris removal operations. However, training models
+able to identify a spacecraft and its pose presents a significant challenge due
+to a lack of available image data for model training. This paper puts forth an
+innovative framework for generating realistic synthetic datasets of Resident
+Space Object (RSO) imagery. Using the International Space Station (ISS) as a
+test case, it goes on to combine image regression with image restoration
+methodologies to estimate pose from blurred images. An analysis of the proposed
+image recovery and regression techniques was undertaken, providing insights
+into the performance, potential enhancements and limitations when applied to
+real imagery of RSOs. The image recovery approach investigated involves first
+applying image deconvolution using an effective point spread function, followed
+by detail object extraction with a U-Net. Interestingly, the best pose
+performance was attained using only the U-Net for image reconstruction,
+reducing the average Mean Squared Error in image recovery by 97.28% and the
+average angular error by 71.9%. The successful application of U-Net image
+restoration combined with the Resnet50 regression network for pose estimation
+of the International Space Station demonstrates the value of a diverse set of
+evaluation tools for effective solutions to real-world problems such as the
+analysis of distant objects in Earth's orbit. + 
+
+ comment: 10 pages, 13 figures +
+
+
+
+
+ + ☆ UniUIR: Considering Underwater Image Restoration as An All-in-One + Learner + + +
+ Existing underwater image restoration (UIR) methods generally only handle
+color distortion or jointly address color and haze issues, but they often
+overlook the more complex degradations that can occur in underwater scenes. To
+address this limitation, we propose a Universal Underwater Image Restoration
+method, termed UniUIR, which treats the complex scenario of real-world
+underwater mixed distortions in an all-in-one manner. To decouple
+degradation-specific issues and explore the inter-correlations among various
+degradations in the UIR task, we designed the Mamba Mixture-of-Experts module.
+This module enables each expert to identify distinct types of degradation and
+collaboratively extract task-specific priors while maintaining global feature
+representation based on linear complexity. Building upon this foundation, to
+enhance degradation representation and address the task conflicts that arise
+when handling multiple types of degradation, we introduce the spatial-frequency
+prior generator. This module extracts degradation prior information in both
+spatial and frequency domains, and adaptively selects the most appropriate
+task-specific prompts based on image content, thereby improving the accuracy of
+image restoration. Finally, to more effectively address complex,
+region-dependent distortions in the UIR task, we incorporate depth information
+derived from a large-scale pre-trained depth prediction model, thereby enabling
+the network to perceive and leverage depth variations across different image
+regions to handle localized degradation. Extensive experiments demonstrate that
+UniUIR produces more attractive results in both qualitative and quantitative
+comparisons, and shows stronger generalization than state-of-the-art methods. + 
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ LiT: Delving into a Simplified Linear Diffusion Transformer for Image + Generation + + +
+ In commonly used sub-quadratic complexity modules, linear attention benefits
+from simplicity and high parallelism, making it promising for image synthesis
+tasks. However, the architectural design and learning strategy for linear
+attention remain underexplored in this field. In this paper, we offer a suite
+of ready-to-use solutions for efficient linear diffusion Transformers. Our core
+contributions include: (1) Simplified Linear Attention using few heads,
+observing the free-lunch effect of performance without latency increase. (2)
+Weight inheritance from a fully pre-trained diffusion Transformer: initializing
+the linear Transformer using the pre-trained diffusion Transformer and loading
+all parameters except for those related to linear attention. (3) Hybrid
+knowledge distillation objective: using a pre-trained diffusion Transformer to
+help the training of the student linear Transformer, supervising not only the
+predicted noise but also the variance of the reverse diffusion process. These
+guidelines lead to our proposed Linear Diffusion Transformer (LiT), an
+efficient text-to-image Transformer that can be deployed offline on a laptop.
+Experiments show that on the class-conditional 256x256 and 512x512 ImageNet
+benchmarks, LiT achieves highly competitive FID while reducing training steps
+by 80% and 77%, respectively, compared to DiT. LiT also rivals methods based on
+Mamba or Gated Linear Attention. Besides, for text-to-image generation, LiT
+allows for the rapid synthesis of up to 1K resolution photorealistic images.
+Project page: https://techmonsterwang.github.io/LiT/. + 
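+ For reference, single-head kernelized linear attention of the kind simplified here replaces the
+softmax with a feature map so that a key-value summary can be computed once, giving cost linear in
+sequence length. The sketch below uses the common elu(x) + 1 feature map and is a generic
+illustration, not LiT's exact module.
+import numpy as np
+
+def linear_attention(q, k, v, eps=1e-6):
+    """Kernelized linear attention, single head: O(n * d^2) instead of O(n^2 * d)."""
+    phi = lambda x: np.where(x > 0, x + 1.0, np.exp(x))   # elu(x) + 1 feature map
+    q, k = phi(q), phi(k)
+    kv = k.T @ v                      # (d, d_v) summary of keys and values
+    z = q @ k.sum(0)                  # per-query normalizer
+    return (q @ kv) / (z[:, None] + eps)
+
+rng = np.random.default_rng(0)
+n, d = 256, 64
+q, k, v = (rng.standard_normal((n, d)) for _ in range(3))
+print(linear_attention(q, k, v).shape)   # (256, 64)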
+
+ comment: 21 pages, 12 figures +
+
+
+
+
+ + ☆ MorphoSkel3D: Morphological Skeletonization of 3D Point Clouds for + Informed Sampling in Object Classification and Retrieval + + +
+ Point clouds are a set of data points in space representing the 3D geometry
+of objects. A fundamental step in their processing is to identify a subset of
+points to represent the shape. While traditional sampling methods often fail to
+incorporate geometrical information, recent developments in learning-based
+sampling models have achieved significant levels of performance. With the
+integration of geometrical priors, the ability to learn and preserve the
+underlying structure can be enhanced when sampling. To shed light on the shape,
+a qualitative skeleton serves as an effective descriptor to guide sampling for
+both local and global geometries. In this paper, we introduce MorphoSkel3D as a
+new technique based on morphology to facilitate an efficient skeletonization of
+shapes. With its low computational cost, MorphoSkel3D is a unique, rule-based
+algorithm whose quality and performance we benchmark on two large datasets,
+ModelNet and ShapeNet, under different sampling ratios. The results show that
+training with MorphoSkel3D leads to an informed and more accurate sampling in
+the practical application of object classification and point cloud retrieval. + 
+
+
+
+
+ + ☆ A Novel Tracking Framework for Devices in X-ray Leveraging Supplementary + Cue-Driven Self-Supervised Features + + +
+ To restore proper blood flow in blocked coronary arteries via an angioplasty
+procedure, accurate placement of devices such as catheters, balloons, and
+stents under live fluoroscopy or diagnostic angiography is crucial. Identified
+balloon markers help in enhancing stent visibility in X-ray sequences, while
+the catheter tip aids in precise navigation and co-registering vessel
+structures, reducing the need for contrast in angiography. However, accurate
+detection of these devices in interventional X-ray sequences faces significant
+challenges, particularly due to occlusions from contrasted vessels and other
+devices and distractions from the surroundings, resulting in failures to track
+such small objects. While most tracking methods rely on spatial correlation of
+past and current appearance, they often lack the strong motion comprehension
+essential for navigating through these challenging conditions, and fail to
+effectively detect multiple instances in the scene. To overcome these
+limitations, we propose a self-supervised learning approach that enhances its
+spatio-temporal understanding by incorporating supplementary cues and learning
+across multiple representation spaces on a large dataset. Building on this, we
+introduce a generic real-time tracking framework that effectively leverages the
+pretrained spatio-temporal network and also takes the historical appearance and
+trajectory data into account. This results in enhanced localization of multiple
+instances of device landmarks. Our method outperforms state-of-the-art methods
+in interventional X-ray device tracking, especially in stability and
+robustness, achieving an 87% reduction in max error for balloon marker
+detection and a 61% reduction in max error for catheter tip detection. + 
+
+
+
+
+ + ☆ 3D Object Manipulation in a Single Image using Generative Models + + +
+ Object manipulation in images aims to not only edit the object's presentation
+but also endow objects with motion. Previous methods encountered challenges in
+concurrently handling static editing and dynamic generation, while also
+struggling to achieve fidelity in object appearance and scene lighting. In this
+work, we introduce OMG3D, a novel framework that integrates precise geometric
+control with the generative power of diffusion models, thus achieving
+significant enhancements in visual performance. Our framework first converts 2D
+objects into 3D, enabling user-directed modifications and lifelike motions at
+the geometric level. To address texture realism, we propose CustomRefiner, a
+texture refinement module that pre-trains a customized diffusion model to align
+the details and style of coarse renderings of the rough 3D model with the
+original image, further refining the texture. Additionally, we introduce
+IllumiCombiner, a lighting processing module that estimates and corrects
+background lighting to match human visual perception, resulting in more
+realistic shadow effects. Extensive experiments demonstrate the outstanding
+visual performance of our approach in both static and dynamic scenarios.
+Remarkably, all these steps can be done using one NVIDIA 3090. Project page is
+at https://whalesong-zrs.github.io/OMG3D-projectpage/ + 
+
+
+
+
+ + ☆ DynamicEarth: How Far are We from Open-Vocabulary Change Detection? + + +
+ Monitoring Earth's evolving land covers requires methods capable of detecting +changes across a wide range of categories and contexts. Existing change +detection methods are hindered by their dependency on predefined classes, +reducing their effectiveness in open-world applications. To address this issue, +we introduce open-vocabulary change detection (OVCD), a novel task that bridges +vision and language to detect changes across any category. Considering the lack +of high-quality data and annotation, we propose two training-free frameworks, +M-C-I and I-M-C, which leverage and integrate off-the-shelf foundation models +for the OVCD task. The insight behind the M-C-I framework is to discover all +potential changes and then classify these changes, while the insight of I-M-C +framework is to identify all targets of interest and then determine whether +their states have changed. Based on these two frameworks, we instantiate to +obtain several methods, e.g., SAM-DINOv2-SegEarth-OV, Grounding-DINO-SAM2-DINO, +etc. Extensive evaluations on 5 benchmark datasets demonstrate the superior +generalization and robustness of our OVCD methods over existing supervised and +unsupervised methods. To support continued exploration, we release +DynamicEarth, a dedicated codebase designed to advance research and application +of OVCD. https://likyoo.github.io/DynamicEarth + +
+
+
+
+
+ + ☆ PreciseCam: Precise Camera Control for Text-to-Image Generation + + +
+ Images as an artistic medium often rely on specific camera angles and lens +distortions to convey ideas or emotions; however, such precise control is +missing in current text-to-image models. We propose an efficient and general +solution that allows precise control over the camera when generating both +photographic and artistic images. Unlike prior methods that rely on predefined +shots, we rely solely on four simple extrinsic and intrinsic camera parameters, +removing the need for pre-existing geometry, reference 3D objects, and +multi-view data. We also present a novel dataset with more than 57,000 images, +along with their text prompts and ground-truth camera parameters. Our +evaluation shows precise camera control in text-to-image generation, surpassing +traditional prompt engineering approaches. Our data, model, and code are +publicly available at https://graphics.unizar.es/projects/PreciseCam2024. + +
+
+
+
+
+ + ☆ DocTTT: Test-Time Training for Handwritten Document Recognition Using + Meta-Auxiliary Learning + + +
+ Despite recent significant advancements in Handwritten Document Recognition +(HDR), the efficient and accurate recognition of text against complex +backgrounds, diverse handwriting styles, and varying document layouts remains a +practical challenge. Moreover, this issue is seldom addressed in academic +research, particularly in scenarios with minimal annotated data available. In +this paper, we introduce the DocTTT framework to address these challenges. The +key innovation of our approach is that it uses test-time training to adapt the +model to each specific input during testing. We propose a novel Meta-Auxiliary +learning approach that combines Meta-learning and self-supervised Masked +Autoencoder~(MAE). During testing, we adapt the visual representation +parameters using a self-supervised MAE loss. During training, we learn the +model parameters using a meta-learning framework, so that the model parameters +are learned to adapt to a new input effectively. Experimental results show that +our proposed method significantly outperforms existing state-of-the-art +approaches on benchmark datasets. + +
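+ The test-time training idea can be pictured as adapting a copy of the visual encoder on a
+self-supervised masked-reconstruction loss for each test input before running the recognition head.
+The tiny modules, masking scheme, and hyperparameters below are stand-ins chosen so the sketch runs
+end to end; they are not the DocTTT architecture.
+import copy
+import torch
+import torch.nn as nn
+
+class TinyEncoder(nn.Module):
+    """Stand-in for the visual representation backbone (hypothetical)."""
+    def __init__(self, dim=64):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(32, dim), nn.ReLU(), nn.Linear(dim, dim))
+    def forward(self, x):
+        return self.net(x)
+
+def test_time_adapt(encoder, decoder, head, x, steps=3, lr=1e-3, mask_ratio=0.5):
+    """Adapt a copy of the encoder to one test input with an MAE-style loss,
+    then run the recognition head (sketch only)."""
+    enc = copy.deepcopy(encoder)                    # never touch the deployed weights
+    opt = torch.optim.SGD(enc.parameters(), lr=lr)
+    for _ in range(steps):
+        mask = (torch.rand_like(x) > mask_ratio).float()
+        recon = decoder(enc(x * mask))
+        loss = ((recon - x) ** 2 * (1 - mask)).mean()   # reconstruct the masked parts
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    with torch.no_grad():
+        return head(enc(x))
+
+encoder, decoder, head = TinyEncoder(), nn.Linear(64, 32), nn.Linear(64, 10)
+x = torch.randn(8, 32)
+print(test_time_adapt(encoder, decoder, head, x).shape)   # torch.Size([8, 10])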
+
+ comment: WACV2025, camera ready with updated reference +
+
+
+
+
+ + ☆ CrossDiff: Diffusion Probabilistic Model With Cross-conditional + Encoder-Decoder for Crack Segmentation + + +
+ Crack Segmentation in industrial concrete surfaces is a challenging task +because cracks usually exhibit intricate morphology with slender appearances. +Traditional segmentation methods often struggle to accurately locate such +cracks, leading to inefficiencies in maintenance and repair processes. In this +paper, we propose a novel diffusion-based model with a cross-conditional +encoder-decoder, named CrossDiff, which is the first to introduce the diffusion +probabilistic model for the crack segmentation task. Specifically, CrossDiff +integrates a cross-encoder and a cross-decoder into the diffusion model to +constitute a cross-shaped diffusion model structure. The cross-encoder enhances +the ability to retain crack details and the cross-decoder helps extract the +semantic features of cracks. As a result, CrossDiff can better handle slender +cracks. Extensive experiments were conducted on five challenging crack datasets +including CFD, CrackTree200, DeepCrack, GAPs384, and Rissbilder. The results +demonstrate that the proposed CrossDiff model achieves impressive performance, +outperforming other state-of-the-art methods by 8.0% in terms of both Dice +score and IoU. The code will be open-source soon. + +
+
+
+
+
+ + ☆ GAMED-Snake: Gradient-aware Adaptive Momentum Evolution Deep Snake Model + for Multi-organ Segmentation + + +
+ Multi-organ segmentation is a critical yet challenging task due to complex +anatomical backgrounds, blurred boundaries, and diverse morphologies. This +study introduces the Gradient-aware Adaptive Momentum Evolution Deep Snake +(GAMED-Snake) model, which establishes a novel paradigm for contour-based +segmentation by integrating gradient-based learning with adaptive momentum +evolution mechanisms. The GAMED-Snake model incorporates three major +innovations: First, the Distance Energy Map Prior (DEMP) generates a +pixel-level force field that effectively attracts contour points towards the +true boundaries, even in scenarios with complex backgrounds and blurred edges. +Second, the Differential Convolution Inception Module (DCIM) precisely extracts +comprehensive energy gradients, significantly enhancing segmentation accuracy. +Third, the Adaptive Momentum Evolution Mechanism (AMEM) employs cross-attention +to establish dynamic features across different iterations of evolution, +enabling precise boundary alignment for diverse morphologies. Experimental +results on four challenging multi-organ segmentation datasets demonstrate that +GAMED-Snake improves the mDice metric by approximately 2% compared to +state-of-the-art methods. Code will be available at +https://github.com/SYSUzrc/GAMED-Snake. + +
+
+
+
+
+ + ☆ AMM-Diff: Adaptive Multi-Modality Diffusion Network for Missing Modality + Imputation + + +
+ In clinical practice, full imaging is not always feasible, often due to +complex acquisition protocols, stringent privacy regulations, or specific +clinical needs. However, missing MR modalities pose significant challenges for +tasks like brain tumor segmentation, especially in deep learning-based +segmentation, as each modality provides complementary information crucial for +improving accuracy. A promising solution is missing data imputation, where +absent modalities are generated from available ones. While generative models +have been widely used for this purpose, most state-of-the-art approaches are +limited to single or dual target translations, lacking the adaptability to +generate missing modalities based on varying input configurations. To address +this, we propose an Adaptive Multi-Modality Diffusion Network (AMM-Diff), a +novel diffusion-based generative model capable of handling any number of input +modalities and generating the missing ones. We designed an Image-Frequency +Fusion Network (IFFN) that learns a unified feature representation through a +self-supervised pretext task across the full input modalities and their +selected high-frequency Fourier components. The proposed diffusion model +leverages this representation, encapsulating prior knowledge of the complete +modalities, and combines it with an adaptive reconstruction strategy to achieve +missing modality completion. Experimental results on the BraTS 2021 dataset +demonstrate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ FDG-Diff: Frequency-Domain-Guided Diffusion Framework for Compressed + Hazy Image Restoration + + +
+ In this study, we reveal that the interaction between haze degradation and
+JPEG compression introduces complex joint loss effects, which significantly
+complicate image restoration. Existing dehazing models often neglect
+compression effects, which limits their effectiveness in practical
+applications. To address these challenges, we introduce three key
+contributions. First, we design FDG-Diff, a novel frequency-domain-guided
+dehazing framework that improves JPEG image restoration by leveraging
+frequency-domain information. Second, we introduce the High-Frequency
+Compensation Module (HFCM), which enhances spatial-domain detail restoration by
+incorporating frequency-domain augmentation techniques into a diffusion-based
+restoration framework. Lastly, the introduction of the Degradation-Aware
+Denoising Timestep Predictor (DADTP) module further enhances restoration
+quality by enabling adaptive region-specific restoration, effectively
+addressing regional degradation inconsistencies in compressed hazy images.
+Experimental results across multiple compressed dehazing datasets demonstrate
+that our method consistently outperforms the latest state-of-the-art
+approaches. Code is available at https://github.com/SYSUzrc/FDG-Diff. + 
+
+
+
+
+ + ☆ Enhancing Monocular Depth Estimation with Multi-Source Auxiliary Tasks + + +
+ Monocular depth estimation (MDE) is a challenging task in computer vision, +often hindered by the cost and scarcity of high-quality labeled datasets. We +tackle this challenge using auxiliary datasets from related vision tasks for an +alternating training scheme with a shared decoder built on top of a pre-trained +vision foundation model, while giving a higher weight to MDE. Through extensive +experiments we demonstrate the benefits of incorporating various in-domain +auxiliary datasets and tasks to improve MDE quality on average by ~11%. Our +experimental analysis shows that auxiliary tasks have different impacts, +confirming the importance of task selection, highlighting that quality gains +are not achieved by merely adding data. Remarkably, our study reveals that +using semantic segmentation datasets as Multi-Label Dense Classification (MLDC) +often results in additional quality gains. Lastly, our method significantly +improves the data efficiency for the considered MDE datasets, enhancing their +quality while reducing their size by at least 80%. This paves the way for using +auxiliary data from related tasks to improve MDE quality despite limited +availability of high-quality labeled data. Code is available at +https://jugit.fz-juelich.de/ias-8/mdeaux. + +
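+ The alternating scheme with a higher weight on MDE can be read as weighted task sampling per
+training step, with all tasks sharing one decoder. The task names and weights below are assumptions
+for illustration; the actual weighting in the paper may well be deterministic rather than sampled.
+import random
+
+def alternating_schedule(task_weights, n_steps, seed=0):
+    """Yield one task per step, sampled proportionally to its weight so the
+    primary task (depth) dominates the schedule (illustrative only)."""
+    rng = random.Random(seed)
+    tasks, weights = zip(*task_weights.items())
+    for _ in range(n_steps):
+        yield rng.choices(tasks, weights=weights, k=1)[0]
+
+schedule = alternating_schedule({"depth": 0.6, "segmentation": 0.2, "normals": 0.2}, 10)
+print(list(schedule))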
+
+ comment: Paper accepted at WACV 2025 +
+
+
+
+
+ + ☆ Machine Learning Modeling for Multi-order Human Visual Motion Processing + + +
+ Our research aims to develop machines that learn to perceive visual motion as +do humans. While recent advances in computer vision (CV) have enabled DNN-based +models to accurately estimate optical flow in naturalistic images, a +significant disparity remains between CV models and the biological visual +system in both architecture and behavior. This disparity includes humans' +ability to perceive the motion of higher-order image features (second-order +motion), which many CV models fail to capture because of their reliance on the +intensity conservation law. Our model architecture mimics the cortical V1-MT +motion processing pathway, utilizing a trainable motion energy sensor bank and +a recurrent graph network. Supervised learning employing diverse naturalistic +videos allows the model to replicate psychophysical and physiological findings +about first-order (luminance-based) motion perception. For second-order motion, +inspired by neuroscientific findings, the model includes an additional sensing +pathway with nonlinear preprocessing before motion energy sensing, implemented +using a simple multilayer 3D CNN block. When exploring how the brain acquired +the ability to perceive second-order motion in natural environments, in which +pure second-order signals are rare, we hypothesized that second-order +mechanisms were critical when estimating robust object motion amidst optical +fluctuations, such as highlights on glossy surfaces. We trained our +dual-pathway model on novel motion datasets with varying material properties of +moving objects. We found that training to estimate object motion from +non-Lambertian materials naturally endowed the model with the capacity to +perceive second-order motion, as can humans. The resulting model effectively +aligns with biological systems while generalizing to both first- and +second-order motion phenomena in natural scenes. + +
+
+
+
+
+ + ☆ Modality Unified Attack for Omni-Modality Person Re-Identification + + +
+ Deep learning based person re-identification (re-id) models have been widely +employed in surveillance systems. Recent studies have demonstrated that +black-box single-modality and cross-modality re-id models are vulnerable to +adversarial examples (AEs), leaving the robustness of multi-modality re-id +models unexplored. Due to the lack of knowledge about the specific type of +model deployed in the target black-box surveillance system, we aim to generate +modality unified AEs for omni-modality (single-, cross- and multi-modality) +re-id models. Specifically, we propose a novel Modality Unified Attack method +to train modality-specific adversarial generators to generate AEs that +effectively attack different omni-modality models. A multi-modality model is +adopted as the surrogate model, wherein the features of each modality are +perturbed by metric disruption loss before fusion. To collapse the common +features of omni-modality models, Cross Modality Simulated Disruption approach +is introduced to mimic the cross-modality feature embeddings by intentionally +feeding images to non-corresponding modality-specific subnetworks of the +surrogate model. Moreover, Multi Modality Collaborative Disruption strategy is +devised to facilitate the attacker to comprehensively corrupt the informative +content of person images by leveraging a multi modality feature collaborative +metric disruption loss. Extensive experiments show that our MUA method can +effectively attack the omni-modality re-id models, achieving 55.9%, 24.4%, +49.0% and 62.7% mean mAP Drop Rate, respectively. + +
+
+ comment: 9 pages,3 figures +
+
+
+
+
+ + ☆ Patent Figure Classification using Large Vision-language Models + + +
+ Patent figure classification facilitates faceted search in patent retrieval
+systems, enabling efficient prior art search. Existing approaches have explored
+patent figure classification for only a single aspect and for aspects with a
+limited number of concepts. In recent years, large vision-language models
+(LVLMs) have shown tremendous performance across numerous computer vision
+downstream tasks; however, they remain unexplored for patent figure
+classification. Our work explores the efficacy of LVLMs in patent figure visual
+question answering (VQA) and classification, focusing on zero-shot and few-shot
+learning scenarios. For this purpose, we introduce new datasets, PatFigVQA and
+PatFigCLS, for fine-tuning and evaluation regarding multiple aspects of patent
+figures (i.e., type, projection, patent class, and objects). For computationally
+efficient handling of a large number of classes with LVLMs, we propose a novel
+tournament-style classification strategy that leverages a series of
+multiple-choice questions. Experimental results and comparisons of multiple
+classification approaches based on LVLMs and Convolutional Neural Networks
+(CNNs) in few-shot settings show the feasibility of the proposed approaches. + 
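+ The tournament-style strategy can be sketched as repeated multiple-choice rounds over shrinking
+candidate sets, so each model call only ever sees a handful of options. Below, ask_model stands in
+for an LVLM multiple-choice query and is a hypothetical interface, not an actual API.
+def tournament_classify(ask_model, candidates, group_size=4):
+    """Reduce a large label set by letting group winners advance until one
+    candidate remains (sketch of the general idea)."""
+    while len(candidates) > 1:
+        next_round = []
+        for i in range(0, len(candidates), group_size):
+            group = candidates[i:i + group_size]
+            next_round.append(group[0] if len(group) == 1 else ask_model(group))
+        candidates = next_round
+    return candidates[0]
+
+# toy stand-in model that always prefers the lexicographically smallest option
+print(tournament_classify(lambda options: min(options), [f"class_{i}" for i in range(10)]))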
+
+
+
+
+ + ☆ Bad-PFL: Exploring Backdoor Attacks against Personalized Federated + Learning ICLR 2025 + + +
+ Data heterogeneity and backdoor attacks rank among the most significant +challenges facing federated learning (FL). For data heterogeneity, personalized +federated learning (PFL) enables each client to maintain a private personalized +model to cater to client-specific knowledge. Meanwhile, vanilla FL has proven +vulnerable to backdoor attacks. However, recent advancements in PFL community +have demonstrated a potential immunity against such attacks. This paper +explores this intersection further, revealing that existing federated backdoor +attacks fail in PFL because backdoors about manually designed triggers struggle +to survive in personalized models. To tackle this, we design Bad-PFL, which +employs features from natural data as our trigger. As long as the model is +trained on natural data, it inevitably embeds the backdoor associated with our +trigger, ensuring its longevity in personalized models. Moreover, our trigger +undergoes mutual reinforcement training with the model, further solidifying the +backdoor's durability and enhancing attack effectiveness. The large-scale +experiments across three benchmark datasets demonstrate the superior +performance of our attack against various PFL methods, even when equipped with +state-of-the-art defense mechanisms. + +
+
+ comment: Accepted to ICLR 2025 +
+
+
+
+
+ + ☆ Combining Knowledge Graph and LLMs for Enhanced Zero-shot Visual + Question Answering + + +
+ Zero-shot visual question answering (ZS-VQA), an emerging critical research
+area, aims to answer visual questions without providing training samples.
+Existing research in ZS-VQA has proposed leveraging knowledge graphs or large
+language models (LLMs) as external information sources to help VQA models
+comprehend images and questions. However, LLMs often struggle to accurately
+interpret specific question meanings. Meanwhile, although knowledge graphs
+contain rich entity relationships, it is challenging to effectively connect
+their entities to individual image content for visual question answering. In
+this paper, we propose a novel design that combines knowledge graphs and LLMs
+for zero-shot visual question answering. Our approach uses LLMs' powerful
+understanding capabilities to accurately interpret image content through a
+strategic question search mechanism. Meanwhile, the knowledge graph is used to
+expand and connect users' queries to the image content for better visual
+question answering. An optimization algorithm is further used to determine the
+optimal weights for the loss functions derived from different information
+sources, towards a globally optimal set of candidate answers. Experimental
+results on two benchmark datasets demonstrate that our model achieves
+state-of-the-art (SOTA) performance. Both source code and benchmark data will
+be released for public access.
+
+
+
+
+
+ + ☆ Can masking background and object reduce static bias for zero-shot + action recognition? + + +
+ In this paper, we address the issue of static bias in zero-shot action
+recognition. Action recognition models need to represent the action itself, not
+the appearance. However, some fully-supervised works show that models often
+rely on static appearances, such as the background and objects, rather than
+human actions. This issue, known as static bias, has not been investigated in
+the zero-shot setting. Although CLIP-based zero-shot models are now common, it
+remains unclear whether they sufficiently focus on human actions, as CLIP
+primarily captures appearance features related to language. In this work, we
+investigate the influence of static bias in zero-shot action recognition with
+CLIP-based models. Our approach involves masking backgrounds, objects, and
+people differently during training and validation. Experiments with background
+masking show that models depend on background bias, as their performance
+decreases on Kinetics400. However, for Mimetics, which has a weak background
+bias, masking the background leads to improved performance even when the
+background is masked during validation. Furthermore, masking both the
+background and objects in different colors improves performance on SSv2, which
+has a strong object bias. These results suggest that masking the background or
+objects during training prevents models from overly depending on static bias
+and makes them focus more on human actions.
+
+
+ comment: In proc. of MMM2025 +
+
+
+
+
+ + ☆ Explicit Eigenvalue Regularization Improves Sharpness-Aware Minimization + + +
+ Sharpness-Aware Minimization (SAM) has attracted significant attention for +its effectiveness in improving generalization across various tasks. However, +its underlying principles remain poorly understood. In this work, we analyze +SAM's training dynamics using the maximum eigenvalue of the Hessian as a +measure of sharpness, and propose a third-order stochastic differential +equation (SDE), which reveals that the dynamics are driven by a complex mixture +of second- and third-order terms. We show that alignment between the +perturbation vector and the top eigenvector is crucial for SAM's effectiveness +in regularizing sharpness, but find that this alignment is often inadequate in +practice, limiting SAM's efficiency. Building on these insights, we introduce +Eigen-SAM, an algorithm that explicitly aims to regularize the top Hessian +eigenvalue by aligning the perturbation vector with the leading eigenvector. We +validate the effectiveness of our theory and the practical advantages of our +proposed approach through comprehensive experiments. Code is available at +https://github.com/RitianLuo/EigenSAM. + +
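+ One way to make the alignment step concrete is to estimate the top Hessian
+eigenvector with a few power-iteration steps on Hessian-vector products and
+mix it into SAM's perturbation direction. A rough PyTorch sketch under those
+assumptions; this is not the authors' exact Eigen-SAM algorithm, and the
+mixing coefficient `alpha` is hypothetical:
+
+ import torch
+
+ def hvp(grads, params, v):
+     # Hessian-vector product via double backprop.
+     return torch.autograd.grad(grads, params, grad_outputs=v, retain_graph=True)
+
+ def eigen_sam_perturbation(loss, params, rho=0.05, alpha=0.5, iters=5):
+     """SAM-style perturbation whose direction mixes the gradient with a
+     power-iteration estimate of the top Hessian eigenvector."""
+     g = torch.autograd.grad(loss, params, create_graph=True)
+     v = [torch.randn_like(p) for p in params]
+     for _ in range(iters):                        # power iteration
+         hv = hvp(g, params, v)
+         norm = torch.sqrt(sum((h * h).sum() for h in hv)) + 1e-12
+         v = [h / norm for h in hv]
+     dot = sum((gi * vi).sum() for gi, vi in zip(g, v))
+     v = [vi if dot >= 0 else -vi for vi in v]     # align sign with the gradient
+     mixed = [(1 - alpha) * gi.detach() + alpha * vi for gi, vi in zip(g, v)]
+     norm = torch.sqrt(sum((m * m).sum() for m in mixed)) + 1e-12
+     return [rho * m / norm for m in mixed]        # epsilon added to the weights
+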
+
+
+
+
+ + ☆ DWTNeRF: Boosting Few-shot Neural Radiance Fields via Discrete Wavelet + Transform + + +
+ Neural Radiance Fields (NeRF) has achieved superior performance in novel view +synthesis and 3D scene representation, but its practical applications are +hindered by slow convergence and reliance on dense training views. To this end, +we present DWTNeRF, a unified framework based on Instant-NGP's fast-training +hash encoding. It is coupled with regularization terms designed for few-shot +NeRF, which operates on sparse training views. Our DWTNeRF includes a novel +Discrete Wavelet loss that allows explicit prioritization of low frequencies +directly in the training objective, reducing few-shot NeRF's overfitting on +high frequencies in earlier training stages. We additionally introduce a +model-based approach, based on multi-head attention, that is compatible with +INGP-based models, which are sensitive to architectural changes. On the 3-shot +LLFF benchmark, DWTNeRF outperforms Vanilla NeRF by 15.07% in PSNR, 24.45% in +SSIM and 36.30% in LPIPS. Our approach encourages a re-thinking of current +few-shot approaches for INGP-based models. + +
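+ The Discrete Wavelet loss can be illustrated with a single-level 2D Haar
+transform: compare rendered and ground-truth patches mainly in the
+low-frequency (LL) sub-band and down-weight the high-frequency residual. A
+minimal sketch assuming (B, C, H, W) patches with even H and W; the weights
+are illustrative and this is not DWTNeRF's exact formulation:
+
+ import torch
+ import torch.nn.functional as F
+
+ def haar_ll(x):
+     # Single-level 2D Haar LL sub-band of an image batch (B, C, H, W).
+     a, b = x[..., 0::2, 0::2], x[..., 0::2, 1::2]
+     c, d = x[..., 1::2, 0::2], x[..., 1::2, 1::2]
+     return (a + b + c + d) / 2.0
+
+ def dwt_low_freq_loss(rendered, target, low_w=1.0, high_w=0.1):
+     ll_r, ll_t = haar_ll(rendered), haar_ll(target)
+     low = F.mse_loss(ll_r, ll_t)
+     # High-frequency residual = image minus its per-2x2-block mean.
+     up = lambda ll: F.interpolate(ll / 2.0, scale_factor=2, mode="nearest")
+     high = F.mse_loss(rendered - up(ll_r), target - up(ll_t))
+     return low_w * low + high_w * high
+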
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Multiple Queries with Multiple Keys: A Precise Prompt Matching Paradigm + for Prompt-based Continual Learning + + +
+ Continual learning requires machine learning models to continuously acquire +new knowledge in dynamic environments while avoiding the forgetting of previous +knowledge. Prompt-based continual learning methods effectively address the +issue of catastrophic forgetting through prompt expansion and selection. +However, existing approaches often suffer from low accuracy in prompt +selection, which can result in the model receiving biased knowledge and making +biased predictions. To address this issue, we propose the Multiple Queries with +Multiple Keys (MQMK) prompt matching paradigm for precise prompt selection. The +goal of MQMK is to select the prompts whose training data distribution most +closely matches that of the test sample. Specifically, Multiple Queries enable +precise breadth search by introducing task-specific knowledge, while Multiple +Keys perform deep search by representing the feature distribution of training +samples at a fine-grained level. Experiments show that MQMK enhances the prompt +matching rate by over 30% in challenging scenarios and achieves +state-of-the-art performance on three widely adopted continual learning +benchmarks. Once this paper is accepted, we will release the code. + +
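+ The matching paradigm can be sketched as scoring each task by comparing
+several task-specific query vectors of the test sample against several keys
+summarizing that task's training feature distribution, then loading the
+best-scoring task's prompts. A toy sketch; the shapes and the max-then-mean
+aggregation are illustrative assumptions, not the paper's exact design:
+
+ import torch
+ import torch.nn.functional as F
+
+ def select_prompt(queries, keys):
+     """queries: (T, Q, D) task-specific queries for one test sample.
+     keys: (T, K, D) fine-grained keys per task (e.g. centroids of that
+     task's training features). Returns the best-matching task index."""
+     q = F.normalize(queries, dim=-1)
+     k = F.normalize(keys, dim=-1)
+     sim = torch.einsum("tqd,tkd->tqk", q, k)       # cosine similarities
+     score = sim.max(dim=-1).values.mean(dim=-1)    # best key per query, averaged
+     return int(score.argmax())
+
+ # Example: 5 tasks, 3 queries and 4 keys each, 64-dim features.
+ print(select_prompt(torch.randn(5, 3, 64), torch.randn(5, 4, 64)))
+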
+
+
+
+
+ + ☆ TeD-Loc: Text Distillation for Weakly Supervised Object Localization + + +
+ Weakly supervised object localization (WSOL) using classification models +trained with only image-class labels remains an important challenge in computer +vision. Given their reliance on classification objectives, traditional WSOL +methods like class activation mapping focus on the most discriminative object +parts, often missing the full spatial extent. In contrast, recent WSOL methods +based on vision-language models like CLIP require ground truth classes or +external classifiers to produce a localization map, limiting their deployment +in downstream tasks. Moreover, methods like GenPromp attempt to address these +issues but introduce considerable complexity due to their reliance on +conditional denoising processes and intricate prompt learning. This paper +introduces Text Distillation for Localization (TeD-Loc), an approach that +directly distills knowledge from CLIP text embeddings into the model backbone +and produces patch-level localization. Multiple instance learning of these +image patches allows for accurate localization and classification using one +model without requiring external classifiers. Such integration of textual and +visual modalities addresses the longstanding challenge of achieving accurate +localization and classification concurrently, as WSOL methods in the literature +typically converge at different epochs. Extensive experiments show that +leveraging text embeddings and localization cues provides a cost-effective WSOL +model. TeD-Loc improves Top-1 LOC accuracy over state-of-the-art models by +about 5% on both CUB and ILSVRC datasets, while significantly reducing +computational complexity compared to GenPromp. + +
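+ The distillation step can be pictured as aligning backbone patch embeddings
+with CLIP text embeddings of the class names, so that per-patch class scores
+double as the localization map, while a multiple-instance-learning pooling
+turns them into an image-level prediction. A schematic sketch with placeholder
+shapes and a simple top-k pooling; not the authors' exact heads or losses:
+
+ import torch
+ import torch.nn.functional as F
+
+ def patch_text_logits(patch_feats, text_embeds, temperature=0.07):
+     """patch_feats: (B, P, D) patch embeddings projected to the CLIP text
+     space; text_embeds: (C, D) class-name text embeddings."""
+     p = F.normalize(patch_feats, dim=-1)
+     t = F.normalize(text_embeds, dim=-1)
+     return torch.einsum("bpd,cd->bpc", p, t) / temperature
+
+ def mil_image_logits(patch_logits, k=10):
+     # Top-k mean over patches keeps the image-level prediction tied to
+     # the most class-relevant patches (assumes P >= k).
+     return patch_logits.topk(k, dim=1).values.mean(dim=1)   # (B, C)
+
+ # patch_logits[..., c] reshaped to the patch grid serves as the
+ # localization map for class c.
+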
+
+
+
+
+ + ☆ Image Motion Blur Removal in the Temporal Dimension with Video Diffusion + Models + + +
+ Most motion deblurring algorithms rely on spatial-domain convolution models, +which struggle with the complex, non-linear blur arising from camera shake and +object motion. In contrast, we propose a novel single-image deblurring approach +that treats motion blur as a temporal averaging phenomenon. Our core innovation +lies in leveraging a pre-trained video diffusion transformer model to capture +diverse motion dynamics within a latent space. It sidesteps explicit kernel +estimation and effectively accommodates diverse motion patterns. We implement +the algorithm within a diffusion-based inverse problem framework. Empirical +results on synthetic and real-world datasets demonstrate that our method +outperforms existing techniques in deblurring complex motion blur scenarios. +This work paves the way for utilizing powerful video diffusion models to +address single-image deblurring challenges. + +
+
+
+
+
+ + ☆ Adapting OpenAI's CLIP Model for Few-Shot Image Inspection in + Manufacturing Quality Control: An Expository Case Study with Multiple + Application Examples + + +
+ This expository paper introduces a simplified approach to image-based quality +inspection in manufacturing using OpenAI's CLIP (Contrastive Language-Image +Pretraining) model adapted for few-shot learning. While CLIP has demonstrated +impressive capabilities in general computer vision tasks, its direct +application to manufacturing inspection presents challenges due to the domain +gap between its training data and industrial applications. We evaluate CLIP's +effectiveness through five case studies: metallic pan surface inspection, 3D +printing extrusion profile analysis, stochastic textured surface evaluation, +automotive assembly inspection, and microstructure image classification. Our +results show that CLIP can achieve high classification accuracy with relatively +small learning sets (50-100 examples per class) for single-component and +texture-based applications. However, the performance degrades with complex +multi-component scenes. We provide a practical implementation framework that +enables quality engineers to quickly assess CLIP's suitability for their +specific applications before pursuing more complex solutions. This work +establishes CLIP-based few-shot learning as an effective baseline approach that +balances implementation simplicity with robust performance, demonstrated in +several manufacturing quality control applications. + +
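+ In its simplest form, the few-shot recipe is: embed the 50-100 reference
+images per class with CLIP's image encoder, average them into class
+prototypes, and classify a new inspection image by cosine similarity to the
+nearest prototype. A minimal sketch where `embed` is a placeholder for the
+CLIP image encoder and the class names are illustrative:
+
+ import numpy as np
+
+ def build_prototypes(embed, examples_by_class):
+     """examples_by_class: dict mapping class name -> list of images.
+     embed(image) must return a 1-D feature vector."""
+     protos = {}
+     for name, images in examples_by_class.items():
+         feats = np.stack([embed(img) for img in images]).astype(float)
+         feats /= np.linalg.norm(feats, axis=1, keepdims=True)
+         protos[name] = feats.mean(axis=0)
+     return protos
+
+ def classify(embed, image, protos):
+     f = embed(image).astype(float)
+     f /= np.linalg.norm(f)
+     scores = {n: float(f @ p / np.linalg.norm(p)) for n, p in protos.items()}
+     return max(scores, key=scores.get)     # e.g. "normal" vs. "defective"
+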
+
+ comment: 31 pages, 13 figures +
+
+
+
+
+ + ☆ ViDDAR: Vision Language Model-Based Task-Detrimental Content Detection + for Augmented Reality + + +
+ In Augmented Reality (AR), virtual content enhances user experience by +providing additional information. However, improperly positioned or designed +virtual content can be detrimental to task performance, as it can impair users' +ability to accurately interpret real-world information. In this paper we +examine two types of task-detrimental virtual content: obstruction attacks, in +which virtual content prevents users from seeing real-world objects, and +information manipulation attacks, in which virtual content interferes with +users' ability to accurately interpret real-world information. We provide a +mathematical framework to characterize these attacks and create a custom +open-source dataset for attack evaluation. To address these attacks, we +introduce ViDDAR (Vision language model-based Task-Detrimental content Detector +for Augmented Reality), a comprehensive full-reference system that leverages +Vision Language Models (VLMs) and advanced deep learning techniques to monitor +and evaluate virtual content in AR environments, employing a user-edge-cloud +architecture to balance performance with low latency. To the best of our +knowledge, ViDDAR is the first system to employ VLMs for detecting +task-detrimental content in AR settings. Our evaluation results demonstrate +that ViDDAR effectively understands complex scenes and detects task-detrimental +content, achieving up to 92.15% obstruction detection accuracy with a detection +latency of 533 ms, and an 82.46% information manipulation content detection +accuracy with a latency of 9.62 s. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ MEDFORM: A Foundation Model for Contrastive Learning of CT Imaging and + Clinical Numeric Data in Multi-Cancer Analysis + + +
+ Computed tomography (CT) and clinical numeric data are essential modalities
+for cancer evaluation, but building large-scale multimodal training datasets
+for developing medical foundation models remains challenging due to the
+structural complexity of multi-slice CT data and the high cost of expert
+annotation. In this study, we propose MEDFORM, a multimodal pre-training
+strategy that guides CT image representation learning using complementary
+information from clinical data for medical foundation model development.
+MEDFORM efficiently processes CT slices through multiple instance learning
+(MIL) and adopts a dual pre-training strategy: first pretraining the CT slice
+feature extractor using SimCLR-based self-supervised learning, then aligning CT
+and clinical modalities through cross-modal contrastive learning. Our model was
+pre-trained on three different cancer types: lung cancer (141,171 slices),
+breast cancer (8,100 slices), and colorectal cancer (10,393 slices). The
+experimental results demonstrate that this dual pre-training strategy improves
+cancer classification performance and maintains robust performance in few-shot
+learning scenarios. Code is available at
+https://github.com/DigitalHealthcareLab/25MultiModalFoundationModel.git
+
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ☆ Multimodal AI on Wound Images and Clinical Notes for Home Patient + Referral + + +
+ Chronic wounds affect 8.5 million Americans, particularly the elderly and
+patients with diabetes. These wounds can take up to nine months to heal, making
+regular care essential to ensure healing and prevent severe outcomes like limb
+amputations. Many patients receive care at home from visiting nurses with
+varying levels of wound expertise, leading to inconsistent care. Problematic,
+non-healing wounds should be referred to wound specialists, but referral
+decisions in non-clinical settings are often erroneous, delayed, or
+unnecessary.
+
+ This paper introduces the Deep Multimodal Wound Assessment Tool (DM-WAT), a
+machine learning framework designed to assist visiting nurses in deciding
+whether to refer chronic wound patients. DM-WAT analyzes smartphone-captured
+wound images and clinical notes from Electronic Health Records (EHRs). It uses
+DeiT-Base-Distilled, a Vision Transformer (ViT), to extract visual features
+from images and DeBERTa-base to extract text features from clinical notes.
+DM-WAT combines visual and text features using an intermediate fusion approach.
+To address challenges posed by a small and imbalanced dataset, it integrates
+image and text augmentation with transfer learning to achieve high performance.
+In evaluations, DM-WAT achieved 77% (std 3%) accuracy and a 70% (std 2%) F1
+score, outperforming prior approaches. Score-CAM and Captum interpretation
+algorithms provide insights into the specific parts of image and text inputs
+that influence recommendations, enhancing interpretability and trust.
+
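+ Intermediate fusion here means concatenating the pooled image and text
+feature vectors before a shared classification head, rather than fusing raw
+inputs or averaging separate predictions. A simplified PyTorch sketch; the
+feature dimensions and head sizes are illustrative assumptions:
+
+ import torch
+ import torch.nn as nn
+
+ class IntermediateFusionClassifier(nn.Module):
+     def __init__(self, img_dim=768, txt_dim=768, hidden=256, n_classes=2):
+         super().__init__()
+         self.head = nn.Sequential(
+             nn.Linear(img_dim + txt_dim, hidden),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+             nn.Linear(hidden, n_classes),   # refer vs. do-not-refer
+         )
+
+     def forward(self, img_feat, txt_feat):
+         fused = torch.cat([img_feat, txt_feat], dim=-1)
+         return self.head(fused)
+
+ # img_feat would come from the ViT's pooled output and txt_feat from the
+ # DeBERTa sentence representation.
+ logits = IntermediateFusionClassifier()(torch.randn(4, 768), torch.randn(4, 768))
+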
+
+ comment: arXiv admin note: text overlap with arXiv:2208.05051 by other authors +
+
+
+
+
+ + ☆ Revisiting Data Augmentation for Ultrasound Images + + +
+ Data augmentation is a widely used and effective technique to improve the +generalization performance of deep neural networks. Yet, despite often facing +limited data availability when working with medical images, it is frequently +underutilized. This appears to come from a gap in our collective understanding +of the efficacy of different augmentation techniques across different tasks and +modalities. One modality where this is especially true is ultrasound imaging. +This work addresses this gap by analyzing the effectiveness of different +augmentation techniques at improving model performance across a wide range of +ultrasound image analysis tasks. To achieve this, we introduce a new +standardized benchmark of 14 ultrasound image classification and semantic +segmentation tasks from 10 different sources and covering 11 body regions. Our +results demonstrate that many of the augmentations commonly used for tasks on +natural images are also effective on ultrasound images, even more so than +augmentations developed specifically for ultrasound images in some cases. We +also show that diverse augmentation using TrivialAugment, which is widely used +for natural images, is also effective for ultrasound images. Moreover, our +proposed methodology represents a structured approach for assessing various +data augmentations that can be applied to other contexts and modalities. + +
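+ TrivialAugment is available off the shelf in torchvision, so adding it to an
+ultrasound training pipeline is a one-line change; whether it helps a given
+task is exactly what the benchmark measures. A minimal usage sketch; the
+resize size and grayscale normalization statistics are placeholders:
+
+ from torchvision import transforms
+
+ train_transform = transforms.Compose([
+     transforms.Resize((224, 224)),
+     transforms.TrivialAugmentWide(),   # one randomly chosen op per image
+     transforms.ToTensor(),
+     transforms.Normalize(mean=[0.5], std=[0.5]),   # placeholder stats
+ ])
+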
+
+ comment: For associated source code see + https://github.com/adamtupper/ultrasound-augmentation +
+
+
+
+
+ + ☆ Map Prediction and Generative Entropy for Multi-Agent Exploration + + +
+ Traditionally, autonomous reconnaissance applications have acted on explicit +sets of historical observations. Aided by recent breakthroughs in generative +technologies, this work enables robot teams to act beyond what is currently +known about the environment by inferring a distribution of reasonable +interpretations of the scene. We developed a map predictor that inpaints the +unknown space in a multi-agent 2D occupancy map during an exploration mission. +From a comparison of several inpainting methods, we found that a fine-tuned +latent diffusion inpainting model could provide rich and coherent +interpretations of simulated urban environments with relatively little +computation time. By iteratively inferring interpretations of the scene +throughout an exploration run, we are able to identify areas that exhibit high +uncertainty in the prediction, which we formalize with the concept of +generative entropy. We prioritize tasks in regions of high generative entropy, +hypothesizing that this will expedite convergence on an accurate predicted map +of the scene. In our study we juxtapose this new paradigm of task ranking with +the state of the art, which ranks regions to explore by those which maximize +expected information recovery. We compare both of these methods in a simulated +urban environment with three vehicles. Our results demonstrate that by using +our new task ranking method, we can predict a correct scene significantly +faster than with a traditional information-guided method. + +
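+ Generative entropy can be made concrete by sampling several inpainted
+completions of the unknown map region and measuring, per cell, the entropy of
+the predicted occupancy across samples; high-entropy cells are where the
+generator disagrees with itself and exploration is prioritized. A toy numpy
+sketch under that reading; the sample count and binary-occupancy assumption
+are illustrative:
+
+ import numpy as np
+
+ def generative_entropy(samples, eps=1e-9):
+     """samples: (N, H, W) array of N inpainted occupancy maps in [0, 1].
+     Returns an (H, W) map of per-cell binary entropy across samples."""
+     p = np.clip(samples.mean(axis=0), eps, 1 - eps)   # empirical P(occupied)
+     return -(p * np.log(p) + (1 - p) * np.log(1 - p))
+
+ # Example: 8 sampled completions of a 64x64 occupancy map.
+ samples = (np.random.rand(8, 64, 64) > 0.5).astype(float)
+ print(generative_entropy(samples).shape)
+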
+
+
+
+
+ + ☆ MONA: Moving Object Detection from Videos Shot by Dynamic Camera + + +
+ Dynamic urban environments, characterized by moving cameras and objects, pose
+significant challenges for camera trajectory estimation by complicating the
+distinction between camera-induced and object motion. We introduce MONA, a
+novel framework designed for robust moving object detection and segmentation
+from videos shot by dynamic cameras. MONA comprises two key modules: Dynamic
+Points Extraction, which leverages optical flow and tracking-any-point methods
+to identify dynamic points, and Moving Object Segmentation, which employs
+adaptive bounding box filtering and the Segment Anything Model for precise
+moving object segmentation. We validate MONA by integrating it with the camera
+trajectory estimation method LEAP-VO, and it achieves state-of-the-art results
+on the MPI Sintel dataset compared to existing methods. These results
+demonstrate MONA's effectiveness for moving object detection and its potential
+for many other applications in the urban planning field.
+
+
+
+
+
+ + ♻ ☆ Cross-D Conv: Cross-Dimensional Transferable Knowledge Base via Fourier + Shifting Operation + + +
+ In biomedical imaging analysis, the dichotomy between 2D and 3D data presents
+a significant challenge. While 3D volumes offer superior real-world
+applicability, they are less available for each modality and harder to train at
+large scale, whereas 2D samples are abundant but less comprehensive. This paper
+introduces the \texttt{Cross-D Conv} operation, a novel approach that bridges
+the dimensional gap by learning phase shifting in the Fourier domain. Our
+method enables seamless weight transfer between 2D and 3D convolution
+operations, effectively facilitating cross-dimensional learning. The proposed
+architecture leverages the abundance of 2D training data to enhance 3D model
+performance, offering a practical solution to the multimodal data scarcity
+challenge in 3D medical model pretraining. Experimental validation on the
+RadImagenet (2D) and multimodal volumetric sets demonstrates that our approach
+achieves comparable or superior performance in feature quality assessment. The
+enhanced convolution operation presents new opportunities for developing
+efficient classification and segmentation models in medical imaging. This work
+represents an advancement in cross-dimensional and multimodal medical image
+analysis, offering a robust framework for utilizing 2D priors in 3D model
+pretraining while maintaining the computational efficiency of 2D training.
+
+
+ comment: Accepted for ISBI25; Codes&Weights: + https://github.com/convergedmachine/Cross-D-Conv +
+
+
+
+
+ + ♻ ☆ An Efficient Framework for Crediting Data Contributors of Diffusion + Models + + +
+ As diffusion models are deployed in real-world settings, and their +performance is driven by training data, appraising the contribution of data +contributors is crucial to creating incentives for sharing quality data and to +implementing policies for data compensation. Depending on the use case, model +performance corresponds to various global properties of the distribution +learned by a diffusion model (e.g., overall aesthetic quality). Hence, here we +address the problem of attributing global properties of diffusion models to +data contributors. The Shapley value provides a principled approach to +valuation by uniquely satisfying game-theoretic axioms of fairness. However, +estimating Shapley values for diffusion models is computationally impractical +because it requires retraining on many training data subsets corresponding to +different contributors and rerunning inference. We introduce a method to +efficiently retrain and rerun inference for Shapley value estimation, by +leveraging model pruning and fine-tuning. We evaluate the utility of our method +with three use cases: (i) image quality for a DDPM trained on a CIFAR dataset, +(ii) demographic diversity for an LDM trained on CelebA-HQ, and (iii) aesthetic +quality for a Stable Diffusion model LoRA-finetuned on Post-Impressionist +artworks. Our results empirically demonstrate that our framework can identify +important data contributors across models' global properties, outperforming +existing attribution methods for diffusion models. + +
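+ The Shapley value of a contributor is their expected marginal contribution to
+the global property over random orderings of all contributors; the paper's
+contribution is making the retrain-and-rerun step cheap, while the outer
+estimator can be plain Monte Carlo permutation sampling. A sketch where
+`utility(subset)` is a placeholder for "retrain on this subset and measure the
+global property":
+
+ import random
+
+ def shapley_monte_carlo(contributors, utility, n_permutations=200, seed=0):
+     """Estimate Shapley values with random permutations."""
+     rng = random.Random(seed)
+     values = {c: 0.0 for c in contributors}
+     for _ in range(n_permutations):
+         order = list(contributors)
+         rng.shuffle(order)
+         subset, prev = [], utility([])
+         for c in order:
+             subset.append(c)
+             cur = utility(subset)
+             values[c] += (cur - prev) / n_permutations   # marginal contribution
+             prev = cur
+     return values
+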
+
+
+
+
+ + ♻ ☆ VisMin: Visual Minimal-Change Understanding NeurIPS 2024 + + +
+ Fine-grained understanding of objects, attributes, and relationships between +objects is crucial for visual-language models (VLMs). Existing benchmarks +primarily focus on evaluating VLMs' capability to distinguish between two very +similar captions given an image. In this paper, we introduce a new, challenging +benchmark termed Visual Minimal-Change Understanding (VisMin), which requires +models to predict the correct image-caption match given two images and two +captions. The image pair and caption pair contain minimal changes, i.e., only +one aspect changes at a time from among the following: object, attribute, +count, and spatial relation. These changes test the models' understanding of +objects, attributes (such as color, material, shape), counts, and spatial +relationships between objects. We built an automatic framework using large +language models and diffusion models, followed by a rigorous 4-step +verification process by human annotators. Empirical experiments reveal that +current VLMs exhibit notable deficiencies in understanding spatial +relationships and counting abilities. We also generate a large-scale training +dataset to finetune CLIP and Idefics2, showing significant improvements in +fine-grained understanding across benchmarks and in CLIP's general image-text +alignment. We release all resources, including the benchmark, training data, +and finetuned model checkpoints, at https://vismin.net/. + +
+
+ comment: Accepted at NeurIPS 2024. Project URL at https://vismin.net/ +
+
+
+
+
+ + ♻ ☆ GSVC: Efficient Video Representation and Compression Through 2D Gaussian + Splatting + + +
+ 3D Gaussian splats have emerged as a revolutionary, effective, learned +representation for static 3D scenes. In this work, we explore using 2D Gaussian +splats as a new primitive for representing videos. We propose GSVC, an approach +to learning a set of 2D Gaussian splats that can effectively represent and +compress video frames. GSVC incorporates the following techniques: (i) To +exploit temporal redundancy among adjacent frames, which can speed up training +and improve the compression efficiency, we predict the Gaussian splats of a +frame based on its previous frame; (ii) To control the trade-offs between file +size and quality, we remove Gaussian splats with low contribution to the video +quality; (iii) To capture dynamics in videos, we randomly add Gaussian splats +to fit content with large motion or newly-appeared objects; (iv) To handle +significant changes in the scene, we detect key frames based on loss +differences during the learning process. Experiment results show that GSVC +achieves good rate-distortion trade-offs, comparable to state-of-the-art video +codecs such as AV1 and VVC, and a rendering speed of 1500 fps for a 1920x1080 +video. + +
+
+
+
+
+ + ♻ ☆ Towards Interpretable Radiology Report Generation via Concept + Bottlenecks using a Multi-Agentic RAG + + +
+ Deep learning has advanced medical image classification, but interpretability +challenges hinder its clinical adoption. This study enhances interpretability +in Chest X-ray (CXR) classification by using concept bottleneck models (CBMs) +and a multi-agent Retrieval-Augmented Generation (RAG) system for report +generation. By modeling relationships between visual features and clinical +concepts, we create interpretable concept vectors that guide a multi-agent RAG +system to generate radiology reports, enhancing clinical relevance, +explainability, and transparency. Evaluation of the generated reports using an +LLM-as-a-judge confirmed the interpretability and clinical utility of our +model's outputs. On the COVID-QU dataset, our model achieved 81% classification +accuracy and demonstrated robust report generation performance, with five key +metrics ranging between 84% and 90%. This interpretable multi-agent framework +bridges the gap between high-performance AI and the explainability required for +reliable AI-driven CXR analysis in clinical settings. Our code is available at +https://github.com/tifat58/IRR-with-CBM-RAG.git. + +
+
+ comment: Accepted in the 47th European Conference for Information Retrieval + (ECIR) 2025 +
+
+
+
+
+ + ♻ ☆ Condition-Invariant Semantic Segmentation + + +
+ Adaptation of semantic segmentation networks to different visual conditions +is vital for robust perception in autonomous cars and robots. However, previous +work has shown that most feature-level adaptation methods, which employ +adversarial training and are validated on synthetic-to-real adaptation, provide +marginal gains in condition-level adaptation, being outperformed by simple +pixel-level adaptation via stylization. Motivated by these findings, we propose +to leverage stylization in performing feature-level adaptation by aligning the +internal network features extracted by the encoder of the network from the +original and the stylized view of each input image with a novel feature +invariance loss. In this way, we encourage the encoder to extract features that +are already invariant to the style of the input, allowing the decoder to focus +on parsing these features and not on further abstracting from the specific +style of the input. We implement our method, named Condition-Invariant Semantic +Segmentation (CISS), on the current state-of-the-art domain adaptation +architecture and achieve outstanding results on condition-level adaptation. In +particular, CISS sets the new state of the art in the popular +daytime-to-nighttime Cityscapes$\to$Dark Zurich benchmark. Furthermore, our +method achieves the second-best performance on the normal-to-adverse +Cityscapes$\to$ACDC benchmark. CISS is shown to generalize well to domains +unseen during training, such as BDD100K-night and ACDC-night. Code is publicly +available at https://github.com/SysCV/CISS . + +
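+ The feature invariance idea reduces to adding a distance penalty between
+encoder features of an image and of its stylized counterpart on top of the
+usual segmentation loss. A condensed sketch; the stylization routine, the
+detach choice, and the weight `lam` are placeholders rather than the CISS
+specifics:
+
+ import torch.nn.functional as F
+
+ def training_step(encoder, decoder, stylize, images, labels, lam=0.1):
+     feat_orig = encoder(images)
+     feat_styl = encoder(stylize(images))        # stylized view of the input
+     seg_loss = F.cross_entropy(decoder(feat_orig), labels)
+     inv_loss = F.mse_loss(feat_styl, feat_orig.detach())   # pull stylized
+     return seg_loss + lam * inv_loss                        # features to original
+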
+
+ comment: IEEE T-PAMI 2025 +
+
+
+
+
+ + ♻ ☆ Locate, Assign, Refine: Taming Customized Promptable Image Inpainting + + +
+ Prior studies have made significant progress in image inpainting guided by
+either a text description or a subject image. However, research on inpainting
+with flexible guidance or control, i.e., text-only, image-only, and their
+combination, is still at an early stage. Therefore, in this paper, we introduce
+the multimodal promptable image inpainting project: a new task, a model, and
+data for taming customized image inpainting. We propose LAR-Gen, a novel
+approach for image inpainting that enables seamless inpainting of a specific
+region in an image corresponding to the mask prompt, incorporating both the
+text prompt and the image prompt. Our LAR-Gen adopts a coarse-to-fine manner to
+ensure the context consistency of the source image, subject identity
+consistency, local semantic consistency to the text description, and smoothness
+consistency. It consists of three mechanisms: (i) a Locate mechanism, which
+concatenates the noise with the masked scene image to achieve precise regional
+editing; (ii) an Assign mechanism, which employs a decoupled cross-attention
+mechanism to accommodate multi-modal guidance; and (iii) a Refine mechanism,
+which uses a novel RefineNet to supplement subject details. Additionally, to
+address the issue of scarce training data, we introduce a novel data engine to
+automatically extract substantial pairs of local text prompts and corresponding
+visual instances from vast image data, leveraging publicly available
+pre-trained large models. Extensive experiments and various application
+scenarios demonstrate the superiority of LAR-Gen in terms of both identity
+preservation and text semantic consistency.
+
+
+ comment: 11 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Boosting Diffusion Guidance via Learning Degradation-Aware Models for + Blind Super Resolution + + +
+ Recently, diffusion-based blind super-resolution (SR) methods have shown +great ability to generate high-resolution images with abundant high-frequency +detail, but the detail is often achieved at the expense of fidelity. Meanwhile, +another line of research focusing on rectifying the reverse process of +diffusion models (i.e., diffusion guidance), has demonstrated the power to +generate high-fidelity results for non-blind SR. However, these methods rely on +known degradation kernels, making them difficult to apply to blind SR. To +address these issues, we present DADiff in this paper. DADiff incorporates +degradation-aware models into the diffusion guidance framework, eliminating the +need to know degradation kernels. Additionally, we propose two novel techniques +-- input perturbation and guidance scalar -- to further improve our +performance. Extensive experimental results show that our proposed method has +superior performance over state-of-the-art methods on blind SR benchmarks. + +
+
+ comment: To appear in WACV 2025. Code is available at: + https://github.com/ryanlu2240/DADiff +
+
+
+
+
+ + ♻ ☆ Pay Attention and Move Better: Harnessing Attention for Interactive + Motion Generation and Training-free Editing + + +
+ This research delves into the problem of interactive editing of human motion +generation. Previous motion diffusion models lack explicit modeling of the +word-level text-motion correspondence and good explainability, hence +restricting their fine-grained editing ability. To address this issue, we +propose an attention-based motion diffusion model, namely MotionCLR, with CLeaR +modeling of attention mechanisms. Technically, MotionCLR models the in-modality +and cross-modality interactions with self-attention and cross-attention, +respectively. More specifically, the self-attention mechanism aims to measure +the sequential similarity between frames and impacts the order of motion +features. By contrast, the cross-attention mechanism works to find the +fine-grained word-sequence correspondence and activate the corresponding +timesteps in the motion sequence. Based on these key properties, we develop a +versatile set of simple yet effective motion editing methods via manipulating +attention maps, such as motion (de-)emphasizing, in-place motion replacement, +and example-based motion generation, etc. For further verification of the +explainability of the attention mechanism, we additionally explore the +potential of action-counting and grounded motion generation ability via +attention maps. Our experimental results show that our method enjoys good +generation and editing ability with good explainability. + +
+
+ comment: Updated MotionCLR technical report +
+
+
+
+
+ + ♻ ☆ Search3D: Hierarchical Open-Vocabulary 3D Segmentation RA-L + + +
+ Open-vocabulary 3D segmentation enables exploration of 3D spaces using +free-form text descriptions. Existing methods for open-vocabulary 3D instance +segmentation primarily focus on identifying object-level instances but struggle +with finer-grained scene entities such as object parts, or regions described by +generic attributes. In this work, we introduce Search3D, an approach to +construct hierarchical open-vocabulary 3D scene representations, enabling 3D +search at multiple levels of granularity: fine-grained object parts, entire +objects, or regions described by attributes like materials. Unlike prior +methods, Search3D shifts towards a more flexible open-vocabulary 3D search +paradigm, moving beyond explicit object-centric queries. For systematic +evaluation, we further contribute a scene-scale open-vocabulary 3D part +segmentation benchmark based on MultiScan, along with a set of open-vocabulary +fine-grained part annotations on ScanNet++. Search3D outperforms baselines in +scene-scale open-vocabulary 3D part segmentation, while maintaining strong +performance in segmenting 3D objects and materials. Our project page is +http://search3d-segmentation.github.io. + +
+
+ comment: This manuscript is provided as a pre-print; it has been accepted for
+ publication by IEEE RA-L
+
+
+
+
+
+ + ♻ ☆ Capsule Vision 2024 Challenge: Multi-Class Abnormality Classification + for Video Capsule Endoscopy + + +
+ We present the Capsule Vision 2024 Challenge: Multi-Class Abnormality +Classification for Video Capsule Endoscopy. It was virtually organized by the +Research Center for Medical Image Analysis and Artificial Intelligence (MIAAI), +Department of Medicine, Danube Private University, Krems, Austria in +collaboration with the 9th International Conference on Computer Vision & Image +Processing (CVIP 2024) being organized by the Indian Institute of Information +Technology, Design and Manufacturing (IIITDM) Kancheepuram, Chennai, India. +This document provides an overview of the challenge, including the registration +process, rules, submission format, description of the datasets used, qualified +team rankings, all team descriptions, and the benchmarking results reported by +the organizers. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ Learning to Mask and Permute Visual Tokens for Vision Transformer + Pre-Training + + +
+ The use of self-supervised pre-training has emerged as a promising approach +to enhance the performance of many different visual tasks. In this context, +recent approaches have employed the Masked Image Modeling paradigm, which +pre-trains a backbone by reconstructing visual tokens associated with randomly +masked image patches. This masking approach, however, introduces noise into the +input data during pre-training, leading to discrepancies that can impair +performance during the fine-tuning phase. Furthermore, input masking neglects +the dependencies between corrupted patches, increasing the inconsistencies +observed in downstream fine-tuning tasks. To overcome these issues, we propose +a new self-supervised pre-training approach, named Masked and Permuted Vision +Transformer (MaPeT), that employs autoregressive and permuted predictions to +capture intra-patch dependencies. In addition, MaPeT employs auxiliary +positional information to reduce the disparity between the pre-training and +fine-tuning phases. In our experiments, we employ a fair setting to ensure +reliable and meaningful comparisons and conduct investigations on multiple +visual tokenizers, including our proposed $k$-CLIP which directly employs +discretized CLIP features. Our results demonstrate that MaPeT achieves +competitive performance on ImageNet, compared to baselines and competitors +under the same model setting. We release an implementation of our code and +models at https://github.com/aimagelab/MaPeT. + +
+
+ comment: Computer Vision and Image Understanding (2025) +
+
+
+
+
+ + ♻ ☆ OmniCount: Multi-label Object Counting with Semantic-Geometric Priors + + +
+ Object counting is pivotal for understanding the composition of scenes. +Previously, this task was dominated by class-specific methods, which have +gradually evolved into more adaptable class-agnostic strategies. However, these +strategies come with their own set of limitations, such as the need for manual +exemplar input and multiple passes for multiple categories, resulting in +significant inefficiencies. This paper introduces a more practical approach +enabling simultaneous counting of multiple object categories using an +open-vocabulary framework. Our solution, OmniCount, stands out by using +semantic and geometric insights (priors) from pre-trained models to count +multiple categories of objects as specified by users, all without additional +training. OmniCount distinguishes itself by generating precise object masks and +leveraging varied interactive prompts via the Segment Anything Model for +efficient counting. To evaluate OmniCount, we created the OmniCount-191 +benchmark, a first-of-its-kind dataset with multi-label object counts, +including points, bounding boxes, and VQA annotations. Our comprehensive +evaluation in OmniCount-191, alongside other leading benchmarks, demonstrates +OmniCount's exceptional performance, significantly outpacing existing +solutions. The project webpage is available at +https://mondalanindya.github.io/OmniCount. + +
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ SANER: Annotation-free Societal Attribute Neutralizer for Debiasing CLIP + + +
+ Large-scale vision-language models, such as CLIP, are known to contain
+societal bias regarding protected attributes (e.g., gender, age). This paper
+aims to address the problem of societal bias in CLIP. Although previous studies
+have proposed to mitigate societal bias through adversarial learning or
+test-time projection, our comprehensive study of these works identifies two
+critical limitations: 1) loss of attribute information when it is explicitly
+disclosed in the input and 2) use of attribute annotations during the debiasing
+process. To mitigate societal bias in CLIP and overcome these limitations
+simultaneously, we introduce a simple-yet-effective debiasing method called
+SANER (societal attribute neutralizer) that eliminates attribute information
+from the CLIP text features of attribute-neutral descriptions only.
+Experimental results show that SANER, which does not require attribute
+annotations and preserves original information for attribute-specific
+descriptions, demonstrates superior debiasing ability compared to existing
+methods. Additionally, we observe that SANER does not require retraining CLIP
+from scratch with the original dataset. Moreover, the debiased model can be
+directly applied to a text-to-image generation model by simply replacing the
+text encoder.
+
+
+
+
+
+ + ♻ ☆ InternVideo2.5: Empowering Video MLLMs with Long and Rich Context + Modeling + + +
+ This paper aims to improve the performance of video multimodal large language
+models (MLLMs) via long and rich context (LRC) modeling. To this end, we
+develop a new version of InternVideo2.5 with a focus on enhancing the original
+MLLMs' ability to perceive fine-grained details and capture long-form temporal
+structure in videos. Specifically, our approach incorporates dense vision task
+annotations into MLLMs using direct preference optimization and develops
+compact spatiotemporal representations through adaptive hierarchical token
+compression. Experimental results demonstrate that this unique design of LRC
+greatly improves the results of video MLLMs on mainstream video understanding
+benchmarks (short & long), enabling the MLLM to memorize significantly longer
+video inputs (at least 6x longer than the original) and to master specialized
+vision capabilities like object tracking and segmentation. Our work highlights
+the importance of multimodal context richness (length and fineness) in
+empowering an MLLM's innate abilities (focus and memory), providing new
+insights for future research on video MLLMs. Code and models are available at
+https://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2.5
+
+
+ comment: technical report +
+
+
+
+
+ + ♻ ☆ Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D + Assets Generation + + +
+ We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for
+generating high-resolution textured 3D assets. This system includes two
+foundation components: a large-scale shape generation model -- Hunyuan3D-DiT,
+and a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape
+generative model, built on a scalable flow-based diffusion transformer, aims to
+create geometry that properly aligns with a given condition image, laying a
+solid foundation for downstream applications. The texture synthesis model,
+benefiting from strong geometric and diffusion priors, produces high-resolution
+and vibrant texture maps for either generated or hand-crafted meshes.
+Furthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production
+platform that simplifies the re-creation process of 3D assets. It allows both
+professional and amateur users to manipulate or even animate their meshes
+efficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0
+outperforms previous state-of-the-art models, including both open-source and
+closed-source models, in geometry details, condition alignment, texture
+quality, and more. Hunyuan3D 2.0 is publicly released in order to fill the gaps
+in the open-source 3D community for large-scale foundation generative models.
+The code and pre-trained weights of our models are available at:
+https://github.com/Tencent/Hunyuan3D-2
+
+
+ comment: GitHub link: https://github.com/Tencent/Hunyuan3D-2 +
+
+
+
+
+ + ♻ ☆ Video Depth Anything: Consistent Depth Estimation for Super-Long Videos + + +
+ Depth Anything has achieved remarkable success in monocular depth estimation +with strong generalization ability. However, it suffers from temporal +inconsistency in videos, hindering its practical applications. Various methods +have been proposed to alleviate this issue by leveraging video generation +models or introducing priors from optical flow and camera poses. Nonetheless, +these methods are only applicable to short videos (< 10 seconds) and require a +trade-off between quality and computational efficiency. We propose Video Depth +Anything for high-quality, consistent depth estimation in super-long videos +(over several minutes) without sacrificing efficiency. We base our model on +Depth Anything V2 and replace its head with an efficient spatial-temporal head. +We design a straightforward yet effective temporal consistency loss by +constraining the temporal depth gradient, eliminating the need for additional +geometric priors. The model is trained on a joint dataset of video depth and +unlabeled images, similar to Depth Anything V2. Moreover, a novel +key-frame-based strategy is developed for long video inference. Experiments +show that our model can be applied to arbitrarily long videos without +compromising quality, consistency, or generalization ability. Comprehensive +evaluations on multiple video benchmarks demonstrate that our approach sets a +new state-of-the-art in zero-shot video depth estimation. We offer models of +different scales to support a range of scenarios, with our smallest model +capable of real-time performance at 30 FPS. + +
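+ The temporal consistency loss can be read as constraining the frame-to-frame
+change of the predicted depth to match the frame-to-frame change of a
+reference (ground-truth or pseudo-label) sequence, i.e. penalizing the
+temporal depth gradient error. A rough sketch of that reading, without the
+authors' exact normalization or masking:
+
+ import torch
+
+ def temporal_gradient_loss(pred_depth, ref_depth):
+     """pred_depth, ref_depth: (B, T, H, W) depth sequences."""
+     d_pred = pred_depth[:, 1:] - pred_depth[:, :-1]   # temporal gradient
+     d_ref = ref_depth[:, 1:] - ref_depth[:, :-1]
+     return (d_pred - d_ref).abs().mean()
+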
+
+ comment: Project page: https://videodepthanything.github.io/ +
+
+
+
+
+ + ♻ ☆ PairingNet: A Learning-based Pair-searching and -matching Network for + Image Fragments + + +
+ In this paper, we propose a learning-based image fragment pair-searching and
+-matching approach to solve the challenging restoration problem. Existing works
+use rule-based methods to match similar contour shapes or textures, for which
+hyperparameters are difficult to tune on extensive data and which are
+computationally time-consuming. Therefore, we propose a neural network that can
+effectively utilize neighbor textures together with contour shape information
+to fundamentally improve performance. First, we employ a graph-based network to
+extract the local contour and texture features of fragments. Then, for the
+pair-searching task, we adopt a linear transformer-based module to integrate
+these local features and use a contrastive loss to encode the global features
+of each fragment. For the pair-matching task, we design a weighted fusion
+module to dynamically fuse the extracted local contour and texture features,
+and formulate a similarity matrix for each pair of fragments to calculate the
+matching score and infer the adjacent segment of contours. To faithfully
+evaluate our proposed network, we created a new image fragment dataset through
+an algorithm we designed that tears complete images into irregular fragments.
+The experimental results show that our proposed network achieves excellent
+pair-searching accuracy, reduces matching errors, and significantly reduces
+computational time. Details, source code, and data are available in our
+supplementary material.
+
+
+ comment: 25 pages, 19 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ An Embedding is Worth a Thousand Noisy Labels + + +
+ The performance of deep neural networks scales with dataset size and label +quality, rendering the efficient mitigation of low-quality data annotations +crucial for building robust and cost-effective systems. Existing strategies to +address label noise exhibit severe limitations due to computational complexity +and application dependency. In this work, we propose WANN, a Weighted Adaptive +Nearest Neighbor approach that builds on self-supervised feature +representations obtained from foundation models. To guide the weighted voting +scheme, we introduce a reliability score, which measures the likelihood of a +data label being correct. WANN outperforms reference methods, including a +linear layer trained with robust loss functions, on diverse datasets of varying +size and under various noise types and severities. WANN also exhibits superior +generalization on imbalanced data compared to both Adaptive-NNs (ANN) and fixed +k-NNs. Furthermore, the proposed weighting scheme enhances supervised +dimensionality reduction under noisy labels. This yields a significant boost in +classification performance with 10x and 100x smaller image embeddings, +minimizing latency and storage requirements. Our approach, emphasizing +efficiency and explainability, emerges as a simple, robust solution to overcome +inherent limitations of deep neural network training. The code is available at +https://github.com/francescodisalvo05/wann-noisy-labels . + +
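+ The classifier itself is light: for a query embedding, retrieve the nearest
+training embeddings and take a vote weighted by each neighbor's reliability
+score (the likelihood its label is correct). A compact numpy sketch; the
+reliability scores are taken as given, a fixed k is used, and WANN
+additionally adapts the neighborhood size:
+
+ import numpy as np
+
+ def weighted_knn_predict(query, feats, labels, reliability, k=15):
+     """query: (D,) and feats: (N, D), both L2-normalized;
+     labels: (N,) int class ids; reliability: (N,) scores in [0, 1]."""
+     sims = feats @ query                    # cosine similarity
+     idx = np.argsort(-sims)[:k]             # k nearest neighbors
+     votes = {}
+     for i in idx:
+         votes[labels[i]] = votes.get(labels[i], 0.0) + reliability[i] * sims[i]
+     return max(votes, key=votes.get)
+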
+
+ comment: Preprint - Under Review +
+
+
+
+
+ + ♻ ☆ Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI + Generation and Diffuse Glioma Growth Prediction + + +
+ Diffuse gliomas are malignant brain tumors that grow widespread through the +brain. The complex interactions between neoplastic cells and normal tissue, as +well as the treatment-induced changes often encountered, make glioma tumor +growth modeling challenging. In this paper, we present a novel end-to-end +network capable of future predictions of tumor masks and multi-parametric +magnetic resonance images (MRI) of how the tumor will look at any future time +points for different treatment plans. Our approach is based on cutting-edge +diffusion probabilistic models and deep-segmentation neural networks. We +included sequential multi-parametric MRI and treatment information as +conditioning inputs to guide the generative diffusion process as well as a +joint segmentation process. This allows for tumor growth estimates and +realistic MRI generation at any given treatment and time point. We trained the +model using real-world postoperative longitudinal MRI data with glioma tumor +growth trajectories represented as tumor segmentation maps over time. The model +demonstrates promising performance across various tasks, including generating +high-quality multi-parametric MRI with tumor masks, performing time-series +tumor segmentations, and providing uncertainty estimates. Combined with the +treatment-aware generated MRI, the tumor growth predictions with uncertainty +estimates can provide useful information for clinical decision-making. + +
+
+ comment: preprint in IEEE-TMI, 14 pages
+
+
+
+
+
+ + ♻ ☆ A CNN-Transformer for Classification of Longitudinal 3D MRI Images -- A + Case Study on Hepatocellular Carcinoma Prediction + + +
+ Longitudinal MRI analysis is crucial for predicting disease outcomes, +particularly in chronic conditions like hepatocellular carcinoma (HCC), where +early detection can significantly influence treatment strategies and patient +prognosis. Yet, due to challenges like limited data availability, subtle +parenchymal changes, and the irregular timing of medical screenings, current +approaches have so far focused on cross-sectional imaging data. To address +this, we propose HCCNet, a novel model architecture that integrates a 3D +adaptation of the ConvNeXt CNN architecture with a Transformer encoder, +capturing both the intricate spatial features of 3D MRIs and the complex +temporal dependencies across different time points. HCCNet utilizes a two-stage +pre-training process tailored for longitudinal MRI data. The CNN backbone is +pre-trained using a self-supervised learning framework adapted for 3D MRIs, +while the Transformer encoder is pre-trained with a sequence-order-prediction +task to enhance its understanding of disease progression over time. We +demonstrate the effectiveness of HCCNet by applying it to a cohort of liver +cirrhosis patients undergoing regular MRI screenings for HCC surveillance. Our +results show that HCCNet significantly improves predictive accuracy and +reliability over baseline models, providing a robust tool for personalized HCC +surveillance. The methodological approach presented in this paper is versatile +and can be adapted to various longitudinal MRI screening applications. Its +ability to handle varying patient record lengths and irregular screening +intervals establishes it as an invaluable framework for monitoring chronic +diseases, where timely and accurate disease prognosis is critical for effective +treatment planning. + +
+
+ comment: Submitted for publication to Biomedical Signal Processing and + Control; Incorrect notation corrected +
+
+
+
+
+ + ♻ ☆ Predicate Debiasing in Vision-Language Models Integration for Scene + Graph Generation Enhancement + + +
+ Scene Graph Generation (SGG) provides a basic language representation of
+visual scenes, requiring models to grasp complex and diverse semantics between
+objects. This complexity and diversity in SGG lead to underrepresentation,
+where parts of the triplet labels are rare or even unseen during training,
+resulting in imprecise predictions. To tackle this, we propose integrating
+pretrained Vision-Language Models (VLMs) to enhance the representation.
+However, due to the gap between pretraining and SGG, direct inference of
+pretrained VLMs on SGG leads to severe bias, which stems from the imbalanced
+predicate distribution in the pretraining language set. To alleviate this bias,
+we introduce a novel LM Estimation method to approximate the unattainable
+predicate distribution. Finally, we ensemble the debiased VLMs with SGG models
+to enhance the representation, where we design a certainty-aware indicator to
+score each sample and dynamically adjust the ensemble weights. Our
+training-free method effectively addresses the predicate bias in pretrained
+VLMs, enhances SGG's representation, and significantly improves performance.
+
+
+
+
+
+ + ♻ ☆ PDPP: Projected Diffusion for Procedure Planning in Instructional Videos CVPR 2023 + + +
+ In this paper, we study the problem of procedure planning in instructional
+videos, which aims to make a plan (i.e., a sequence of actions) given the
+current visual observation and the desired goal. Previous works cast this as a
+sequence modeling problem and leverage either intermediate visual observations
+or language instructions as supervision for autoregressive planning, resulting
+in complex learning schemes and expensive annotation costs. To avoid
+intermediate supervision annotation and the error accumulation caused by
+planning autoregressively, we propose a diffusion-based framework, coined PDPP,
+to directly model the whole action-sequence distribution with the task label as
+supervision instead. Our core idea is to treat procedure planning as a
+distribution fitting problem under the given observations, thus transforming
+the planning problem into a sampling process from this distribution during
+inference. The diffusion-based modeling approach also effectively addresses the
+uncertainty issue in procedure planning. Based on PDPP, we further apply joint
+training to our framework to generate plans with varying horizon lengths using
+a single model and to reduce the number of training parameters required. We
+instantiate our PDPP with three popular diffusion models and investigate a
+series of condition-introducing methods in our framework, including condition
+embeddings, MoEs, two-stage prediction and a Classifier-Free Guidance strategy.
+Finally, we apply our PDPP to the Visual Planners for human Assistance problem,
+which requires the goal to be specified in natural language rather than as a
+visual observation. We conduct experiments on challenging datasets of different
+scales, and our PDPP model achieves state-of-the-art performance on multiple
+metrics, even compared with strongly-supervised counterparts. These results
+further demonstrate the effectiveness and generalization ability of our model.
+
+
+ comment: Accepted as a highlight paper at CVPR 2023. Extension accepted by + TPAMI. Code and trained models are available at + https://github.com/MCG-NJU/PDPP +
+
+
+
+
+ + ♻ ☆ Adaptive Retention & Correction for Continual Learning ICLR 2025 + + +
+ Continual learning, also known as lifelong learning or incremental learning, +refers to the process by which a model learns from a stream of incoming data +over time. A common problem in continual learning is the classification layer's +bias towards the most recent task. Traditionally, methods have relied on +incorporating data from past tasks during training to mitigate this issue. +However, the recent shift in continual learning to memory-free environments has +rendered these approaches infeasible. In this study, we propose a solution +focused on the testing phase. We first introduce a simple Out-of-Task Detection +method, OTD, designed to accurately identify samples from past tasks during +testing. Leveraging OTD, we then propose: (1) an Adaptive Retention mechanism +for dynamically tuning the classifier layer on past task data; (2) an Adaptive +Correction mechanism for revising predictions when the model classifies data +from previous tasks into classes from the current task. We name our approach +Adaptive Retention & Correction (ARC). While designed for memory-free +environments, ARC also proves effective in memory-based settings. Extensive +experiments show that our proposed method can be plugged in to virtually any +existing continual learning approach without requiring any modifications to its +training procedure. Specifically, when integrated with state-of-the-art +approaches, ARC achieves an average performance increase of 2.7% and 2.6% on +the CIFAR-100 and Imagenet-R datasets, respectively. + +
+
+ comment: ICLR 2025 +
+
+
+
+
+ + ♻ ☆ InDistill: Information flow-preserving knowledge distillation for model + compression + + +
+ In this paper, we introduce InDistill, a method that serves as a warmup stage +for enhancing Knowledge Distillation (KD) effectiveness. InDistill focuses on +transferring critical information flow paths from a heavyweight teacher to a +lightweight student. This is achieved via a training scheme based on curriculum +learning that considers the distillation difficulty of each layer and the +critical learning periods when the information flow paths are established. This +procedure can lead to a student model that is better prepared to learn from the +teacher. To ensure the applicability of InDistill across a wide range of +teacher-student pairs, we also incorporate a pruning operation when there is a +discrepancy in the width of the teacher and student layers. This pruning +operation reduces the width of the teacher's intermediate layers to match those +of the student, allowing direct distillation without the need for an encoding +stage. The proposed method is extensively evaluated using various pairs of +teacher-student architectures on CIFAR-10, CIFAR-100, and ImageNet datasets +demonstrating that preserving the information flow paths consistently increases +the performance of the baseline KD approaches on both classification and +retrieval settings. The code is available at +https://github.com/gsarridis/InDistill. + +
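+ The width-matching pruning mentioned above can be illustrated with a small sketch: teacher channels are ranked by a simple importance score (here the L1 norm of the corresponding filters, an assumed criterion) and only as many channels as the student has are kept, so teacher and student features can be compared directly. This is a generic sketch, not the InDistill implementation.
+
+import torch
+
+def prune_teacher_channels(teacher_feat, teacher_conv_weight, student_width):
+    """Keep the teacher channels whose filters have the largest L1 norm so the
+    pruned feature map matches the student's width (one simple criterion)."""
+    l1 = teacher_conv_weight.abs().flatten(1).sum(dim=1)   # per-output-channel norm
+    keep = torch.topk(l1, k=student_width).indices
+    return teacher_feat[:, keep], keep                     # features now match the student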
+
+
+
+
+ + ♻ ☆ UrbanVLP: Multi-Granularity Vision-Language Pretraining for Urban + Socioeconomic Indicator Prediction + + +
+ Urban socioeconomic indicator prediction aims to infer various metrics
+related to sustainable development in diverse urban landscapes using
+data-driven methods. However, prevalent pretrained models, particularly those
+reliant on satellite imagery, face dual challenges. First, concentrating
+solely on macro-level patterns from satellite data may introduce bias and miss
+nuanced micro-level details, such as the architectural details of individual
+places. Second, the text generated by the precursor work UrbanCLIP, which fully
+utilizes the extensive knowledge of LLMs, frequently exhibits issues such as
+hallucination and homogenization, resulting in text of unreliable quality. In
+response to these issues, we devise a novel framework entitled UrbanVLP based
+on Vision-Language Pretraining. Our UrbanVLP seamlessly integrates
+multi-granularity information from both macro (satellite) and micro
+(street-view) levels, overcoming the limitations of prior pretrained models.
+Moreover, it introduces automatic text generation and calibration, providing a
+robust guarantee for producing high-quality text descriptions of urban imagery.
+Rigorous experiments conducted across six socioeconomic indicator prediction
+tasks underscore its superior performance.
+
+
+ comment: Accepted as a full paper by AAAI'25 - AI for Social Impact Track +
+
+
+
+
+ + ♻ ☆ SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and + Chain-of-Thought for Embodied Task Planning + + +
+ Spatial reasoning is an essential problem in embodied AI research. Efforts to +enhance spatial reasoning abilities through supplementary spatial data and +fine-tuning have proven limited and ineffective when addressing complex +embodied tasks, largely due to their dependence on language-based outputs. +While some approaches have introduced a point-based action space to mitigate +this issue, they fall short in managing more intricate tasks within complex +environments. This deficiency arises from their failure to fully exploit the +inherent thinking and reasoning capabilities that are fundamental strengths of +Vision-Language Models (VLMs). To address these limitations, we propose a novel +approach named SpatialCoT, specifically designed to bolster the spatial +reasoning capabilities of VLMs. Our approach comprises two stages: spatial +coordinate bi-directional alignment, which aligns vision-language inputs with +spatial coordinates, and chain-of-thought spatial grounding, which harnesses +the reasoning capabilities of language models for advanced spatial reasoning. +We evaluate SpatialCoT on challenging navigation and manipulation tasks, both +in simulation and real-world settings. Experimental results demonstrate that +our method significantly outperforms previous state-of-the-art approaches in +both tasks. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Generalizable Prompt Tuning for Vision-Language Models + + +
+ Prompt tuning for vision-language models such as CLIP involves optimizing the
+text prompts used to generate image-text pairs for specific downstream tasks.
+While hand-crafted or template-based prompts are generally applicable to a
+wider range of unseen classes, they tend to perform poorly in downstream tasks
+(i.e., seen classes). Learnable soft prompts, on the other hand, often perform
+well in downstream tasks but lack generalizability. Additionally, prior
+research has predominantly concentrated on the textual modality, with very few
+studies attempting to explore the prompt's generalization potential from the
+visual modality. Keeping these limitations in mind, we investigate how to
+perform prompt tuning to obtain both competitive downstream performance and
+generalization. The study shows that by treating soft and hand-crafted prompts
+as dual views of the textual modality, and maximizing their mutual information,
+we can better ensemble task-specific and general semantic information.
+Moreover, to generate more expressive prompts, the study introduces a
+class-wise augmentation from the visual modality, resulting in significant
+robustness to a wider range of unseen classes. Extensive evaluations on several
+benchmarks report that the proposed approach achieves competitive results in
+terms of both task-specific performance and general abilities.
+
+
+ comment: in progress +
+
+
+
+
+ + ♻ ☆ MD-Dose: A diffusion model based on the Mamba for radiation dose + prediction + + +
+ Radiation therapy is crucial in cancer treatment. Experienced experts
+typically iteratively generate high-quality dose distribution maps, forming the
+basis for excellent radiation therapy plans. Therefore, automated prediction of
+dose distribution maps is significant in expediting the treatment process and
+providing a better starting point for developing radiation therapy plans. With
+the remarkable results of diffusion models in predicting high-frequency regions
+of dose distribution maps, dose prediction methods based on diffusion models
+have been extensively studied. However, existing methods mainly utilize CNNs or
+Transformers as denoising networks. CNNs lack global receptive fields,
+resulting in suboptimal prediction performance. Transformers excel in
+global modeling but face quadratic complexity with image size, resulting in
+significant computational overhead. To tackle these challenges, we introduce a
+novel diffusion model, MD-Dose, based on the Mamba architecture for predicting
+radiation therapy dose distribution in thoracic cancer patients. In the forward
+process, MD-Dose adds Gaussian noise to dose distribution maps to obtain pure
+noise images. In the backward process, MD-Dose utilizes a noise predictor based
+on Mamba to predict the noise, ultimately outputting the dose distribution
+maps. Furthermore, we develop a Mamba encoder to extract structural information
+and integrate it into the noise predictor for localizing dose regions in the
+planning target volume (PTV) and organs at risk (OARs). Through extensive
+experiments on a dataset of 300 thoracic tumor patients, we showcase the
+superiority of MD-Dose in various metrics and time consumption.
+
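+ As a rough sketch of the backward process described above, the snippet below performs one generic DDPM reverse step in which the noise predictor (Mamba-based in the paper) is additionally conditioned on encoded anatomy features for the PTV and OARs. The schedule handling and interfaces are assumptions, not the authors' sampler.
+
+import torch
+
+@torch.no_grad()
+def reverse_step(noise_predictor, x_t, t, betas, anatomy_feat):
+    """One generic DDPM reverse step; the predictor sees the noisy dose map x_t,
+    the timestep t, and encoded anatomical structure features."""
+    alphas = 1.0 - betas
+    alpha_bar = torch.cumprod(alphas, dim=0)
+    eps = noise_predictor(x_t, t, anatomy_feat)                            # predicted noise
+    mean = (x_t - betas[t] / (1.0 - alpha_bar[t]).sqrt() * eps) / alphas[t].sqrt()
+    if t > 0:
+        mean = mean + betas[t].sqrt() * torch.randn_like(x_t)              # sampling noise
+    return mean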
+
+
+
+
+ + ♻ ☆ ParaHome: Parameterizing Everyday Home Activities Towards 3D Generative + Modeling of Human-Object Interactions + + +
+ To enable machines to understand the way humans interact with the physical +world in daily life, 3D interaction signals should be captured in natural +settings, allowing people to engage with multiple objects in a range of +sequential and casual manipulations. To achieve this goal, we introduce our +ParaHome system designed to capture dynamic 3D movements of humans and objects +within a common home environment. Our system features a multi-view setup with +70 synchronized RGB cameras, along with wearable motion capture devices +including an IMU-based body suit and hand motion capture gloves. By leveraging +the ParaHome system, we collect a new human-object interaction dataset, +including 486 minutes of sequences across 207 captures with 38 participants, +offering advancements with three key aspects: (1) capturing body motion and +dexterous hand manipulation motion alongside multiple objects within a +contextual home environment; (2) encompassing sequential and concurrent +manipulations paired with text descriptions; and (3) including articulated +objects with multiple parts represented by 3D parameterized models. We present +detailed design justifications for our system, and perform key generative +modeling experiments to demonstrate the potential of our dataset. + +
+
+
+
+
+ + ♻ ☆ Out of Length Text Recognition with Sub-String Matching + + +
+ Scene Text Recognition (STR) methods have demonstrated robust performance in
+word-level text recognition. However, in real applications the text image is
+sometimes long because it is detected with multiple horizontal words. This
+creates the need to build long text recognition models from readily available
+short (i.e., word-level) text datasets, which has been little studied
+previously. In this paper, we term this task Out of Length (OOL) text
+recognition. We establish the first Long Text Benchmark (LTB) to facilitate the
+assessment of different methods in long text recognition. Meanwhile, we propose
+a novel method called OOL Text Recognition with sub-String Matching (SMTR). SMTR
+comprises two cross-attention-based modules: one encodes a sub-string
+containing multiple characters into next and previous queries, and the other
+employs the queries to attend to the image features, matching the sub-string
+and simultaneously recognizing its next and previous character. SMTR can
+recognize text of arbitrary length by iterating the process above. To avoid
+being trapped in recognizing highly similar sub-strings, we introduce
+regularization training to compel SMTR to effectively discover subtle
+differences between similar sub-strings for precise matching. In addition, we
+propose an inference augmentation strategy to alleviate confusion caused by
+identical sub-strings in the same text and improve the overall recognition
+efficiency. Extensive experimental results reveal that SMTR, even when trained
+exclusively on short text, outperforms existing methods in public short text
+benchmarks and exhibits a clear advantage on LTB. Code:
+https://github.com/Topdu/OpenOCR.
+
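+ A minimal sketch of the iterative decoding idea: the most recent sub-string is repeatedly used as a query over the image features to predict the next character until an end token appears (only the forward direction is shown; the paper also predicts the previous character). The predict_next interface and window size are hypothetical.
+
+def recognize_long_text(predict_next, image_feats, seed_text, window=5, max_len=200, eos="[EOS]"):
+    """Iteratively extend the recognized string from a short seed sub-string."""
+    chars = list(seed_text)
+    while len(chars) < max_len:
+        nxt = predict_next(image_feats, "".join(chars[-window:]))  # attend with the sub-string
+        if nxt == eos:
+            break
+        chars.append(nxt)
+    return "".join(chars)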
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ HAC++: Towards 100X Compression of 3D Gaussian Splatting ECCV 2024 + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel +view synthesis, boasting rapid rendering speed with high fidelity. However, the +substantial Gaussians and their associated attributes necessitate effective +compression techniques. Nevertheless, the sparse and unorganized nature of the +point cloud of Gaussians (or anchors in our paper) presents challenges for +compression. To achieve a compact size, we propose HAC++, which leverages the +relationships between unorganized anchors and a structured hash grid, utilizing +their mutual information for context modeling. Additionally, HAC++ captures +intra-anchor contextual relationships to further enhance compression +performance. To facilitate entropy coding, we utilize Gaussian distributions to +precisely estimate the probability of each quantized attribute, where an +adaptive quantization module is proposed to enable high-precision quantization +of these attributes for improved fidelity restoration. Moreover, we incorporate +an adaptive masking strategy to eliminate invalid Gaussians and anchors. +Overall, HAC++ achieves a remarkable size reduction of over 100X compared to +vanilla 3DGS when averaged on all datasets, while simultaneously improving +fidelity. It also delivers more than 20X size reduction compared to +Scaffold-GS. Our code is available at +https://github.com/YihangChen-ee/HAC-plus. + +
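+ The Gaussian entropy model mentioned above can be written in a few lines: the probability of a quantized attribute is the Gaussian probability mass over its quantization bin, and its estimated code length is the negative log of that mass. This is the standard formulation used by learned-compression entropy models, shown here as an assumed sketch of HAC++'s coding step rather than its actual code.
+
+import torch
+
+def estimated_bits(y_hat, mu, sigma, q):
+    """Bit estimate for a quantized attribute y_hat under N(mu, sigma) with
+    quantization step q: -log2 of the probability mass over the bin."""
+    dist = torch.distributions.Normal(mu, sigma)
+    p = dist.cdf(y_hat + q / 2) - dist.cdf(y_hat - q / 2)
+    return -torch.log2(p.clamp_min(1e-9))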
+
+ comment: Project Page: https://yihangchen-ee.github.io/project_hac++/ Code: + https://github.com/YihangChen-ee/HAC-plus. This paper is a journal extension + of HAC at arXiv:2403.14530 (ECCV 2024) +
+
+
+
+
+ + ♻ ☆ Make VLM Recognize Visual Hallucination on Cartoon Character Image with + Pose Information + + +
+ Leveraging large-scale Text-to-Image (TTI) models has become a common
+technique for generating exemplars or training datasets in the fields of image
+synthesis, video editing, and 3D reconstruction. However, semantic structural
+visual hallucinations involving perceptually severe defects remain a concern,
+especially in the domain of non-photorealistic rendering (NPR), such as cartoons
+and pixelization-style characters. To detect these hallucinations in NPR, we
+propose a novel semantic structural hallucination detection system using a
+Vision-Language Model (VLM). Our approach leverages in-context learning, an
+emerging capability of large language models in which the VLM is shown a few
+user-provided examples for a specific downstream task, here hallucination
+detection. Based on in-context learning, we introduce pose-aware in-context
+visual learning (PA-ICVL), which improves the overall performance of VLMs by
+providing visual data beyond prompts, namely RGB images and pose information.
+By incorporating pose guidance, we enable VLMs to make more accurate decisions.
+Experimental results demonstrate significant improvements in identifying visual
+hallucinations compared to baseline methods relying solely on RGB images. For
+the two selected VLMs, GPT-4v and Gemini pro vision, our proposed PA-ICVL
+improves hallucination detection from 50% to 78% and from 57% to 80%,
+respectively. This research advances the capability of TTI models toward
+real-world applications by mitigating visual hallucinations via in-context
+visual learning, expanding their potential in non-photorealistic domains. In
+addition, it showcases how users can boost the downstream-specialized
+capability of open VLMs by harnessing additional conditions. We collect a
+synthetic cartoon-hallucination dataset with TTI models; this dataset and the
+final tuned VLM will be publicly available.
+
+
+ comment: Accepted at WACV 2025, Project page: + https://gh-bumsookim.github.io/Cartoon-Hallucinations-Detection/ +
+
+
+
+
+ + ♻ ☆ GPS as a Control Signal for Image Generation + + +
+ We show that the GPS tags contained in photo metadata provide a useful +control signal for image generation. We train GPS-to-image models and use them +for tasks that require a fine-grained understanding of how images vary within a +city. In particular, we train a diffusion model to generate images conditioned +on both GPS and text. The learned model generates images that capture the +distinctive appearance of different neighborhoods, parks, and landmarks. We +also extract 3D models from 2D GPS-to-image models through score distillation +sampling, using GPS conditioning to constrain the appearance of the +reconstruction from each viewpoint. Our evaluations suggest that our +GPS-conditioned models successfully learn to generate images that vary based on +location, and that GPS conditioning improves estimated 3D structure. + +
+
+ comment: Project page: https://cfeng16.github.io/gps-gen/ +
+
+
+
+
+ + ♻ ☆ Teacher Encoder-Student Decoder Denoising Guided Segmentation Network + for Anomaly Detection + + +
+ Visual anomaly detection is a highly challenging task, often categorized as a +one-class classification and segmentation problem. Recent studies have +demonstrated that the student-teacher (S-T) framework effectively addresses +this challenge. However, most S-T frameworks rely solely on pre-trained teacher +networks to guide student networks in learning multi-scale similar features, +overlooking the potential of the student networks to enhance learning through +multi-scale feature fusion. In this study, we propose a novel model named +PFADSeg, which integrates a pre-trained teacher network, a denoising student +network with multi-scale feature fusion, and a guided anomaly segmentation +network into a unified framework. By adopting a unique teacher-encoder and +student-decoder denoising mode, the model improves the student network's +ability to learn from teacher network features. Furthermore, an adaptive +feature fusion mechanism is introduced to train a self-supervised segmentation +network that synthesizes anomaly masks autonomously, significantly increasing +detection performance. Evaluated on the MVTec AD dataset, PFADSeg achieves +state-of-the-art results with an image-level AUC of 98.9%, a pixel-level mean +precision of 76.4%, and an instance-level mean precision of 78.7%. + +
+
+
+
+
+ + ♻ ☆ Open-MAGVIT2: An Open-Source Project Toward Democratizing + Auto-regressive Visual Generation + + +
+ We present Open-MAGVIT2, a family of auto-regressive image generation models
+ranging from 300M to 1.5B parameters. The Open-MAGVIT2 project produces an
+open-source replication of Google's MAGVIT-v2 tokenizer, a tokenizer with a
+super-large codebook (i.e., $2^{18}$ codes), and achieves state-of-the-art
+reconstruction performance (1.17 rFID) on ImageNet $256 \times 256$.
+Furthermore, we explore its application in plain auto-regressive models and
+validate scalability properties. To assist auto-regressive models in predicting
+with a super-large vocabulary, we factorize it into two sub-vocabularies of
+different sizes by asymmetric token factorization, and further introduce "next
+sub-token prediction" to enhance sub-token interaction for better generation
+quality. We release all models and codes to foster innovation and creativity in
+the field of auto-regressive visual generation.
+
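+ Asymmetric token factorization amounts to splitting each code index of the $2^{18}$ vocabulary into two smaller sub-tokens; the sketch below uses an illustrative 6-bit/12-bit split (the actual split sizes used by Open-MAGVIT2 may differ).
+
+def factorize(code, bits_b=12):
+    """Split one code from a 2**18 vocabulary into a coarse and a fine sub-token."""
+    return code >> bits_b, code & ((1 << bits_b) - 1)
+
+def defactorize(tok_a, tok_b, bits_b=12):
+    """Recombine the two sub-tokens into the original code index."""
+    return (tok_a << bits_b) | tok_b
+
+assert defactorize(*factorize(123456)) == 123456   # round-trip check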
+
+
+
+
+ + ♻ On Learning Multi-Modal Forgery Representation for Diffusion Generated + Video Detection + + +
+ Large numbers of synthesized videos from diffusion models pose threats to
+information security and authenticity, leading to an increasing demand for
+generated content detection. However, existing video-level detection algorithms
+primarily focus on detecting facial forgeries and often fail to identify
+diffusion-generated content with a diverse range of semantics. To advance the
+field of video forensics, we propose an innovative algorithm named Multi-Modal
+Detection (MM-Det) for detecting diffusion-generated videos. MM-Det utilizes the
+profound perceptual and comprehensive abilities of Large Multi-modal Models
+(LMMs) by generating a Multi-Modal Forgery Representation (MMFR) from the LMM's
+multi-modal space, enhancing its ability to detect unseen forgery content. In
+addition, MM-Det leverages an In-and-Across Frame Attention (IAFA) mechanism for
+feature augmentation in the spatio-temporal domain, and a dynamic fusion
+strategy helps refine the forgery representations during fusion. Moreover, we
+construct a comprehensive diffusion video dataset, called Diffusion Video
+Forensics (DVF), covering a wide range of forgery videos. MM-Det achieves
+state-of-the-art performance on DVF, demonstrating the effectiveness of our
+algorithm. Both source code and DVF are available at
+https://github.com/SparkleXFantasy/MM-Det.
+
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ SARATR-X: Toward Building A Foundation Model for SAR Target Recognition + + +
+ Despite the remarkable progress in synthetic aperture radar automatic target
+recognition (SAR ATR), recent efforts have concentrated on detecting and
+classifying a specific category, e.g., vehicles, ships, airplanes, or
+buildings. One of the fundamental limitations of the top-performing SAR ATR
+methods is that the learning paradigm is supervised, task-specific,
+limited-category, closed-world learning, which depends on massive amounts of
+accurately annotated samples that are expensively labeled by expert SAR
+analysts and have limited generalization capability and scalability. In this
+work, we make the first attempt towards building a foundation model for SAR
+ATR, termed SARATR-X. SARATR-X learns generalizable representations via
+self-supervised learning (SSL) and provides a cornerstone for label-efficient
+model adaptation to generic SAR target detection and classification tasks.
+Specifically, SARATR-X is trained on 0.18 M unlabelled SAR target samples,
+which are curated by combining contemporary benchmarks and constitute the
+largest publicly available dataset to date. Considering the characteristics of
+SAR images, a backbone tailored for SAR ATR is carefully designed, and a
+two-step SSL method endowed with multi-scale gradient features is applied to
+ensure the feature diversity and model scalability of SARATR-X. The
+capabilities of SARATR-X are evaluated on classification under few-shot and
+robustness settings and detection across various categories and scenes, and
+impressive performance is achieved, often competitive with or even superior to
+prior fully supervised, semi-supervised, or self-supervised algorithms. Our
+SARATR-X and the curated dataset are released at
+https://github.com/waterdisappear/SARATR-X to foster research into foundation
+models for SAR image interpretation.
+
+
+ comment: 20 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Inferring Past Human Actions in Homes with Abductive Reasoning + + +
+ Abductive reasoning aims to make the most likely inference for a given set of +incomplete observations. In this paper, we introduce "Abductive Past Action +Inference", a novel research task aimed at identifying the past actions +performed by individuals within homes to reach specific states captured in a +single image, using abductive inference. The research explores three key +abductive inference problems: past action set prediction, past action sequence +prediction, and abductive past action verification. We introduce several models +tailored for abductive past action inference, including a relational graph +neural network, a relational bilinear pooling model, and a relational +transformer model. Notably, the newly proposed object-relational bilinear graph +encoder-decoder (BiGED) model emerges as the most effective among all methods +evaluated, demonstrating good proficiency in handling the intricacies of the +Action Genome dataset. The contributions of this research significantly advance +the ability of deep learning models to reason about current scene evidence and +make highly plausible inferences about past human actions. This advancement +enables a deeper understanding of events and behaviors, which can enhance +decision-making and improve system capabilities across various real-world +applications such as Human-Robot Interaction and Elderly Care and Health +Monitoring. Code and data available at https://github.com/LUNAProject22/AAR + +
+
+ comment: 15 pages, 8 figures, Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Vector-Symbolic Architecture for Event-Based Optical Flow + + +
+ From a perspective of feature matching, optical flow estimation for event
+cameras involves identifying event correspondences by comparing feature
+similarity across accompanying event frames. In this work, we introduce an
+effective and robust high-dimensional (HD) feature descriptor for event frames,
+utilizing Vector Symbolic Architectures (VSA). The topological similarity among
+neighboring variables within VSA contributes to the enhanced representation
+similarity of feature descriptors for flow-matching points, while its
+structured symbolic representation capacity facilitates feature fusion from
+both event polarities and multiple spatial scales. Based on this HD feature
+descriptor, we propose a novel feature matching framework for event-based
+optical flow, encompassing both model-based (VSA-Flow) and self-supervised
+learning (VSA-SM) methods. In VSA-Flow, accurate optical flow estimation
+validates the effectiveness of HD feature descriptors. In VSA-SM, a novel
+similarity maximization method based on the HD feature descriptor is proposed
+to learn optical flow in a self-supervised way from events alone, eliminating
+the need for auxiliary grayscale images. Evaluation results demonstrate that
+our VSA-based method achieves superior accuracy in comparison to both
+model-based and self-supervised learning methods on the DSEC benchmark, while
+remaining competitive with both on the MVSEC benchmark. This contribution marks
+a significant advancement in event-based optical flow within the feature
+matching methodology.
+
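+ For readers unfamiliar with Vector Symbolic Architectures, the sketch below shows the basic operations such a descriptor builds on: random bipolar hypervectors, binding by elementwise multiplication, bundling by summation, and cosine similarity for matching. It illustrates the general VSA machinery only, not the paper's specific descriptor design.
+
+import numpy as np
+
+D = 8192                                                   # hypervector dimensionality
+rng = np.random.default_rng(0)
+polarity_hv = {p: rng.choice([-1, 1], D) for p in (-1, 1)}  # codebook for event polarity
+
+def bind(a, b):
+    """Binding: elementwise multiplication of two hypervectors."""
+    return a * b
+
+def bundle(vectors):
+    """Bundling: elementwise sum followed by the sign, giving a bipolar vector."""
+    return np.sign(np.sum(vectors, axis=0))
+
+def similarity(a, b):
+    """Cosine similarity used to compare descriptors of candidate matches."""
+    return float(a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))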
+
+
+
+
+ + ♻ ☆ CogMorph: Cognitive Morphing Attacks for Text-to-Image Models + + +
+ The development of text-to-image (T2I) generative models, which enable the
+creation of high-quality synthetic images from textual prompts, has opened new
+frontiers in creative design and content generation. However, this paper
+reveals a significant and previously unrecognized ethical risk inherent in this
+technology and introduces a novel method, termed the Cognitive Morphing Attack
+(CogMorph), which manipulates T2I models to generate images that retain the
+original core subjects but embed toxic or harmful contextual elements. This
+nuanced manipulation exploits the cognitive principle that human perception of
+concepts is shaped by the entire visual scene and its context, producing images
+that amplify emotional harm far beyond attacks that merely preserve the
+original semantics. To address this, we first construct an imagery toxicity
+taxonomy spanning 10 major and 48 sub-categories, aligned with human
+cognitive-perceptual dimensions, and further build a toxicity risk matrix
+resulting in 1,176 high-quality T2I toxic prompts. Based on this, our CogMorph
+first introduces Cognitive Toxicity Augmentation, which develops a cognitive
+toxicity knowledge base with rich external toxic representations for humans
+(e.g., fine-grained visual features) that can be utilized to further guide the
+optimization of adversarial prompts. In addition, we present Contextual
+Hierarchical Morphing, which hierarchically extracts critical parts of the
+original prompt (e.g., scenes, subjects, and body parts), and then iteratively
+retrieves and fuses toxic features to inject harmful contexts. Extensive
+experiments on multiple open-sourced T2I models and black-box commercial APIs
+(e.g., DALLE-3) demonstrate the efficacy of CogMorph, which significantly
+outperforms other baselines by large margins (+20.62% on average).
+
+
+
+
+
+ + ♻ ☆ OpenEarthMap-SAR: A Benchmark Synthetic Aperture Radar Dataset for + Global High-Resolution Land Cover Mapping + + +
+ High-resolution land cover mapping plays a crucial role in addressing a wide
+range of global challenges, including urban planning, environmental monitoring,
+disaster response, and sustainable development. However, creating accurate,
+large-scale land cover datasets remains a significant challenge due to the
+inherent complexities of geospatial data, such as diverse terrain, varying
+sensor modalities, and atmospheric conditions. Synthetic Aperture Radar (SAR)
+imagery, with its ability to penetrate clouds and capture data in all-weather,
+day-and-night conditions, offers unique advantages for land cover mapping.
+Despite these strengths, the lack of benchmark datasets tailored for SAR
+imagery has limited the development of robust models specifically designed for
+this data modality. To bridge this gap and facilitate advancements in SAR-based
+geospatial analysis, we introduce OpenEarthMap-SAR, a benchmark SAR dataset
+for global high-resolution land cover mapping. OpenEarthMap-SAR consists of 1.5
+million segments of 5033 aerial and satellite images with a size of
+1024$\times$1024 pixels, covering 35 regions from Japan, France, and the USA,
+with partially manually annotated and fully pseudo-labeled 8-class land cover
+labels at a ground sampling distance of 0.15--0.5 m. We evaluate the performance
+of state-of-the-art methods for semantic segmentation and present challenging
+problem settings suitable for further technical development. The dataset also
+serves as the official dataset for IEEE GRSS Data Fusion Contest Track I. The
+dataset has been made publicly available at
+https://zenodo.org/records/14622048.
+
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ SpeechAct: Towards Generating Whole-body Motion from Speech + + +
+ This paper addresses the problem of generating whole-body motion from speech. +Despite great successes, prior methods still struggle to produce reasonable and +diverse whole-body motions from speech. This is due to their reliance on +suboptimal representations and a lack of strategies for generating diverse +results. To address these challenges, we present a novel hybrid point +representation to achieve accurate and continuous motion generation, e.g., +avoiding foot skating, and this representation can be transformed into an +easy-to-use representation, i.e., SMPL-X body mesh, for many applications. To +generate whole-body motion from speech, for facial motion, closely tied to the +audio signal, we introduce an encoder-decoder architecture to achieve +deterministic outcomes. However, for the body and hands, which have weaker +connections to the audio signal, we aim to generate diverse yet reasonable +motions. To boost diversity in motion generation, we propose a contrastive +motion learning method to encourage the model to produce more distinctive +representations. Specifically, we design a robust VQ-VAE to learn a quantized +motion codebook using our hybrid representation. Then, we regress the motion +representation from the audio signal by a translation model employing our +contrastive motion learning method. Experimental results validate the superior +performance and the correctness of our model. The project page is available for +research purposes at http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct. + +
+
+ comment: Accepted by TVCG +
+
+
+
+
+ + ♻ SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon + Visuomotor Learning + + +
+ We present a low-cost legged mobile manipulation system that solves
+long-horizon real-world tasks, trained by reinforcement learning purely in
+simulation. This system is made possible by 1) a hierarchical design of a
+high-level policy for visual-mobile manipulation following instructions and a
+low-level policy for quadruped movement and limb control, 2) a progressive
+exploration and learning approach that leverages privileged task decomposition
+information to train the teacher policy for long-horizon tasks, which will
+guide an imitation-based student policy for efficient training of the
+high-level visuomotor policy, and 3) a suite of techniques for minimizing
+sim-to-real gaps.
+ In contrast to previous approaches that use high-end equipment, our system
+demonstrates effective performance with more accessible hardware -
+specifically, a Unitree Go1 quadruped, a WidowX250S arm, and a single
+wrist-mounted RGB camera - despite the increased challenges of sim-to-real
+transfer. When fully trained in simulation, a single policy autonomously solves
+long-horizon tasks such as search, move, grasp, and drop-into, achieving nearly
+80% success. This performance is comparable to that of expert human
+teleoperation on the same tasks but is achieved in a more efficient way, at 1.5
+times the speed of a human expert. The sim-to-real transfer is fluid across
+diverse indoor and outdoor scenes under varying lighting conditions. Finally,
+we discuss the key techniques that enable the entire pipeline, including
+efficient RL training and sim-to-real transfer, to work effectively for legged
+mobile manipulation, and present their ablation results.
+
+
+
+
+
+ + ♻ ☆ Serpent: Scalable and Efficient Image Restoration via Multi-scale + Structured State Space Models + + +
+ The landscape of computational building blocks of efficient image restoration +architectures is dominated by a combination of convolutional processing and +various attention mechanisms. However, convolutional filters, while efficient, +are inherently local and therefore struggle with modeling long-range +dependencies in images. In contrast, attention excels at capturing global +interactions between arbitrary image regions, but suffers from a quadratic cost +in image dimension. In this work, we propose Serpent, an efficient architecture +for high-resolution image restoration that combines recent advances in state +space models (SSMs) with multi-scale signal processing in its core +computational block. SSMs, originally introduced for sequence modeling, can +maintain a global receptive field with a favorable linear scaling in input +size. We propose a novel hierarchical architecture inspired by traditional +signal processing principles, that converts the input image into a collection +of sequences and processes them in a multi-scale fashion. Our experimental +results demonstrate that Serpent can achieve reconstruction quality on par with +state-of-the-art techniques, while requiring orders of magnitude less compute +(up to $150$ fold reduction in FLOPS) and a factor of up to $5\times$ less GPU +memory while maintaining a compact model size. The efficiency gains achieved by +Serpent are especially notable at high image resolutions. + +
+
+
+
+
+ + ♻ ☆ Volumetrically Consistent 3D Gaussian Rasterization + + +
+ Recently, 3D Gaussian Splatting (3DGS) has enabled photorealistic view +synthesis at high inference speeds. However, its splatting-based rendering +model makes several approximations to the rendering equation, reducing physical +accuracy. We show that splatting and its approximations are unnecessary, even +within a rasterizer; we instead volumetrically integrate 3D Gaussians directly +to compute the transmittance across them analytically. We use this analytic +transmittance to derive more physically-accurate alpha values than 3DGS, which +can directly be used within their framework. The result is a method that more +closely follows the volume rendering equation (similar to ray-tracing) while +enjoying the speed benefits of rasterization. Our method represents opaque +surfaces with higher accuracy and fewer points than 3DGS. This enables it to +outperform 3DGS for view synthesis (measured in SSIM and LPIPS). Being +volumetrically consistent also enables our method to work out of the box for +tomography. We match the state-of-the-art 3DGS-based tomography method with +fewer points. + +
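+ The analytic transmittance idea can be illustrated in one dimension: integrating a Gaussian density profile along a ray has a closed form (via the error function for finite bounds), and the resulting optical depth gives the alpha value directly. This toy 1D example, assuming a Gaussian centered at t = 0, is a simplification meant only to convey the principle, not the paper's 3D projection math.
+
+import math
+
+def gaussian_ray_alpha(sigma0, s, t0=None, t1=None):
+    """Alpha from integrating a 1D Gaussian density (peak sigma0, std s) along a
+    ray: alpha = 1 - exp(-tau), with the optical depth tau in closed form."""
+    if t0 is None or t1 is None:
+        tau = sigma0 * s * math.sqrt(2.0 * math.pi)      # integral over the whole line
+    else:
+        tau = sigma0 * s * math.sqrt(math.pi / 2.0) * (  # partial integral via erf
+            math.erf(t1 / (s * math.sqrt(2.0))) - math.erf(t0 / (s * math.sqrt(2.0))))
+    return 1.0 - math.exp(-tau)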
+
+
+
+
+ + ♻ ☆ Aligning Visual Contrastive learning models via Preference Optimization + + +
+ Contrastive learning models have demonstrated impressive abilities to capture +semantic similarities by aligning representations in the embedding space. +However, their performance can be limited by the quality of the training data +and its inherent biases. While Reinforcement Learning from Human Feedback +(RLHF) and Direct Preference Optimization (DPO) have been applied to generative +models to align them with human preferences, their use in contrastive learning +has yet to be explored. This paper introduces a novel method for training +contrastive learning models using Preference Optimization (PO) to break down +complex concepts. Our method systematically aligns model behavior with desired +preferences, enhancing performance on the targeted task. In particular, we +focus on enhancing model robustness against typographic attacks, commonly seen +in contrastive models like CLIP. We further apply our method to disentangle +gender understanding and mitigate gender biases, offering a more nuanced +control over these sensitive attributes. Our experiments demonstrate that +models trained using PO outperform standard contrastive learning techniques +while retaining their ability to handle adversarial challenges and maintain +accuracy on other downstream tasks. This makes our method well-suited for tasks +requiring fairness, robustness, and alignment with specific preferences. We +evaluate our method on several vision-language tasks, tackling challenges such +as typographic attacks. Additionally, we explore the model's ability to +disentangle gender concepts and mitigate gender bias, showcasing the +versatility of our approach. + +
+
+
+
+
+ + ♻ ☆ LatentExplainer: Explaining Latent Representations in Deep Generative + Models with Multimodal Large Language Models + + +
+ Deep generative models like VAEs and diffusion models have advanced various +generation tasks by leveraging latent variables to learn data distributions and +generate high-quality samples. Despite the field of explainable AI making +strides in interpreting machine learning models, understanding latent variables +in generative models remains challenging. This paper introduces +\textit{LatentExplainer}, a framework for automatically generating semantically +meaningful explanations of latent variables in deep generative models. +\textit{LatentExplainer} tackles three main challenges: inferring the meaning +of latent variables, aligning explanations with inductive biases, and handling +varying degrees of explainability. Our approach perturbs latent variables, +interpreting changes in generated data, and uses multi-modal large language +models (MLLMs) to produce human-understandable explanations. We evaluate our +proposed method on several real-world and synthetic datasets, and the results +demonstrate superior performance in generating high-quality explanations for +latent variables. The results highlight the effectiveness of incorporating +inductive biases and uncertainty quantification, significantly enhancing model +interpretability. + +
+
+
+
+
+ + ♻ ☆ Unified 3D MRI Representations via Sequence-Invariant Contrastive + Learning + + +
+ Self-supervised deep learning has accelerated 2D natural image analysis but +remains difficult to translate into 3D MRI, where data are scarce and +pre-trained 2D backbones cannot capture volumetric context. We present a +sequence-invariant self-supervised framework leveraging quantitative MRI +(qMRI). By simulating multiple MRI contrasts from a single 3D qMRI scan and +enforcing consistent representations across these contrasts, we learn +anatomy-centric rather than sequence-specific features. This yields a robust 3D +encoder that performs strongly across varied tasks and protocols. Experiments +on healthy brain segmentation (IXI), stroke lesion segmentation (ARC), and MRI +denoising show significant gains over baseline SSL approaches, especially in +low-data settings (up to +8.3% Dice, +4.2 dB PSNR). Our model also generalises +effectively to unseen sites, demonstrating potential for more scalable and +clinically reliable volumetric analysis. All code and trained models are +publicly available. + +
+
+
+
+
+ + ♻ ☆ DCT-CryptoNets: Scaling Private Inference in the Frequency Domain ICLR 2025 + + +
+ The convergence of fully homomorphic encryption (FHE) and machine learning
+offers unprecedented opportunities for private inference of sensitive data. FHE
+enables computation directly on encrypted data, safeguarding the entire machine
+learning pipeline, including data and model confidentiality. However, existing
+FHE-based implementations for deep neural networks face significant challenges
+in computational cost, latency, and scalability, limiting their practical
+deployment. This paper introduces DCT-CryptoNets, a novel approach that
+operates directly in the frequency domain to reduce the burden of
+computationally expensive non-linear activations and homomorphic bootstrap
+operations during private inference. It does so by utilizing the discrete
+cosine transform (DCT), commonly employed in JPEG encoding, which has inherent
+compatibility with remote computing services where images are generally stored
+and transmitted in this encoded format. DCT-CryptoNets demonstrates
+substantial latency reductions of up to 5.3$\times$ compared to prior work on
+benchmark image classification tasks. Notably, it demonstrates inference on the
+ImageNet dataset within 2.5 hours (down from 12.5 hours on equivalent 96-thread
+compute resources). Furthermore, by learning perceptually salient low-frequency
+information, DCT-CryptoNets improves the reliability of encrypted predictions
+compared to RGB-based networks by reducing error-accumulating homomorphic
+bootstrap operations. DCT-CryptoNets also demonstrates superior scalability to
+RGB-based networks by further reducing computational cost as image size
+increases. This study demonstrates a promising avenue for achieving efficient
+and practical private inference of deep learning models on high-resolution
+images seen in real-world applications.
+
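+ The frequency-domain input such a network operates on can be produced with a JPEG-style blockwise DCT, as in the sketch below; the 8x8 block size and orthonormal normalization are conventional choices and not necessarily the paper's exact preprocessing.
+
+import numpy as np
+from scipy.fft import dctn
+
+def blockwise_dct(image, block=8):
+    """JPEG-style blockwise DCT of a grayscale image; low-frequency coefficients
+    of each block are what a frequency-domain network would consume."""
+    h, w = image.shape
+    out = np.zeros_like(image, dtype=np.float64)
+    for i in range(0, h - h % block, block):
+        for j in range(0, w - w % block, block):
+            out[i:i + block, j:j + block] = dctn(image[i:i + block, j:j + block], norm="ortho")
+    return out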
+
+ comment: ICLR 2025; 10 pages content, 5 pages appendix, 4 figures, 9 tables; + Code @ https://github.com/ar-roy/dct-cryptonets +
+
+
+
+
+ + ♻ ☆ Dequantization and Color Transfer with Diffusion Models + + +
+ We demonstrate an image dequantizing diffusion model that enables novel edits +on natural images. We propose operating on quantized images because they offer +easy abstraction for patch-based edits and palette transfer. In particular, we +show that color palettes can make the output of the diffusion model easier to +control and interpret. We first establish that existing image restoration +methods are not sufficient, such as JPEG noise reduction models. We then +demonstrate that our model can generate natural images that respect the color +palette the user asked for. For palette transfer, we propose a method based on +weighted bipartite matching. We then show that our model generates plausible +images even after extreme palette transfers, respecting user query. Our method +can optionally condition on the source texture in part or all of the image. In +doing so, we overcome a common problem in existing image colorization methods +that are unable to produce colors with a different luminance than the input. We +evaluate several possibilities for texture conditioning and their trade-offs, +including luminance, image gradients, and thresholded gradients, the latter of +which performed best in maintaining texture and color control simultaneously. +Our method can be usefully extended to another practical edit: recoloring +patches of an image while respecting the source texture. Our procedure is +supported by several qualitative and quantitative evaluations. + +
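+ The weighted bipartite matching step for palette transfer can be sketched with a linear assignment solver: each source palette color is matched to a target color under a cost that combines color distance with how often the source color occurs in the image. The weighting shown is one plausible choice, not necessarily the paper's exact cost.
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def match_palettes(src_colors, dst_colors, src_pixel_counts):
+    """Match source palette colors to target palette colors by weighted
+    bipartite matching; returns a source-index -> target-index mapping."""
+    dist = np.linalg.norm(src_colors[:, None, :] - dst_colors[None, :, :], axis=-1)
+    cost = dist * src_pixel_counts[:, None]          # weight by how much each color is used
+    rows, cols = linear_sum_assignment(cost)
+    return dict(zip(rows.tolist(), cols.tolist()))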
+
+ comment: WACV 2025 23 pages, 21 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Video-Guided Foley Sound Generation with Multimodal Controls + + +
+ Generating sound effects for videos often requires creating artistic sound +effects that diverge significantly from real-life sources and flexible control +in the sound design. To address this problem, we introduce MultiFoley, a model +designed for video-guided sound generation that supports multimodal +conditioning through text, audio, and video. Given a silent video and a text +prompt, MultiFoley allows users to create clean sounds (e.g., skateboard wheels +spinning without wind noise) or more whimsical sounds (e.g., making a lion's +roar sound like a cat's meow). MultiFoley also allows users to choose reference +audio from sound effects (SFX) libraries or partial videos for conditioning. A +key novelty of our model lies in its joint training on both internet video +datasets with low-quality audio and professional SFX recordings, enabling +high-quality, full-bandwidth (48kHz) audio generation. Through automated +evaluations and human studies, we demonstrate that MultiFoley successfully +generates synchronized high-quality sounds across varied conditional inputs and +outperforms existing methods. Please see our project page for video results: +https://ificl.github.io/MultiFoley/ + +
+
+ comment: Project site: https://ificl.github.io/MultiFoley/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 31 + +
+
+
+ + ☆ Multi-Agent Feedback Motion Planning using Probably Approximately + Correct Nonlinear Model Predictive Control + + +
+ For many tasks, multi-robot teams often provide greater efficiency, +robustness, and resiliency. However, multi-robot collaboration in real-world +scenarios poses a number of major challenges, especially when dynamic robots +must balance competing objectives like formation control and obstacle avoidance +in the presence of stochastic dynamics and sensor uncertainty. In this paper, +we propose a distributed, multi-agent receding-horizon feedback motion planning +approach using Probably Approximately Correct Nonlinear Model Predictive +Control (PAC-NMPC) that is able to reason about both model and measurement +uncertainty to achieve robust multi-agent formation control while navigating +cluttered obstacle fields and avoiding inter-robot collisions. Our approach +relies not only on the underlying PAC-NMPC algorithm but also on a terminal +cost-function derived from gyroscopic obstacle avoidance. Through numerical +simulation, we show that our distributed approach performs on par with a +centralized formulation, that it offers improved performance in the case of +significant measurement noise, and that it can scale to more complex dynamical +systems. + +
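+ For context on the terminal cost mentioned above, gyroscopic obstacle avoidance steers a robot with a force perpendicular to its velocity whose magnitude grows near obstacles; a textbook 2D form is sketched below. This is the generic construction only, offered as an assumption about the kind of term the paper's terminal cost-function is derived from, not its actual definition.
+
+import numpy as np
+
+def gyroscopic_term(pos, vel, obstacle_pos, gain=1.0, eps=1e-6):
+    """2D gyroscopic avoidance term: rotate the velocity by 90 degrees and scale
+    it by the inverse distance to the obstacle."""
+    d = np.linalg.norm(pos - obstacle_pos) + eps
+    perp = np.array([-vel[1], vel[0]])     # velocity rotated by 90 degrees
+    return (gain / d) * perp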
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ☆ Improving robot understanding using conversational AI: demonstration and + feasibility study + + +
+ Explanations constitute an important aspect of successful human-robot
+interactions and can enhance robot understanding. To improve the understanding
+of the robot, we have developed four levels of explanation (LOE) based on two
+questions: what needs to be explained, and why the robot has made a particular
+decision. The understandable robot requires a communicative action when there
+is disparity between the human's mental model of the robot and the robot's
+state of mind. This communicative action was generated by utilizing a
+conversational AI platform to generate explanations. An adaptive dialog was
+implemented for transitioning from one LOE to another. Here, we demonstrate the
+adaptive dialog in a collaborative task with errors and provide results of a
+feasibility study with users.
+
+
+ comment: 40th Anniversary, IEEE International Conference on Robotics and + Automation,2024 +
+
+
+
+
+ + ☆ Evaluating Efficiency and Engagement in Scripted and LLM-Enhanced + Human-Robot Interactions + + +
+ To achieve natural and intuitive interaction with people, HRI frameworks +combine a wide array of methods for human perception, intention communication, +human-aware navigation and collaborative action. In practice, when encountering +unpredictable behavior of people or unexpected states of the environment, these +frameworks may lack the ability to dynamically recognize such states, adapt and +recover to resume the interaction. Large Language Models (LLMs), owing to their +advanced reasoning capabilities and context retention, present a promising +solution for enhancing robot adaptability. This potential, however, may not +directly translate to improved interaction metrics. This paper considers a +representative interaction with an industrial robot involving approach, +instruction, and object manipulation, implemented in two conditions: (1) fully +scripted and (2) including LLM-enhanced responses. We use gaze tracking and +questionnaires to measure the participants' task efficiency, engagement, and +robot perception. The results indicate higher subjective ratings for the LLM +condition, but objective metrics show that the scripted condition performs +comparably, particularly in efficiency and focus during simple tasks. We also +note that the scripted condition may have an edge over LLM-enhanced responses +in terms of response latency and energy consumption, especially for trivial and +repetitive interactions. + +
+
+ comment: Accepted as a Late-Breaking Report to the 2025, 20th ACM/IEEE + International Conference on Human-Robot Interaction (HRI) +
+
+
+
+
+ + ☆ Towards autonomous photogrammetric forest inventory using a lightweight + under-canopy robotic drone + + +
+ Drones are increasingly used in forestry to capture high-resolution remote +sensing data. While operations above the forest canopy are already highly +automated, flying inside forests remains challenging, primarily relying on +manual piloting. Inside dense forests, reliance on the Global Navigation +Satellite System (GNSS) for localization is not feasible. Additionally, the +drone must autonomously adjust its flight path to avoid collisions. Recently, +advancements in robotics have enabled autonomous drone flights in GNSS-denied +obstacle-rich areas. In this article, a step towards autonomous forest data +collection is taken by building a prototype of a robotic under-canopy drone +utilizing state-of-the-art open-source methods and validating its performance +for data collection inside forests. The autonomous flight capability was +evaluated through multiple test flights in two boreal forest test sites. The +tree parameter estimation capability was studied by conducting diameter at +breast height (DBH) estimation using onboard stereo camera data and +photogrammetric methods. The prototype conducted flights in selected +challenging forest environments, and the experiments showed excellent +performance in forest reconstruction with a miniaturized stereoscopic +photogrammetric system. The stem detection algorithm managed to identify 79.31 +% of the stems. The DBH estimation had a root mean square error (RMSE) of 3.33 +cm (12.79 %) and a bias of 1.01 cm (3.87 %) across all trees. For trees with a +DBH less than 30 cm, the RMSE was 1.16 cm (5.74 %), and the bias was 0.13 cm +(0.64 %). When considering the overall performance in terms of DBH accuracy, +autonomy, and forest complexity, the proposed approach was superior compared to +methods proposed in the scientific literature. Results provided valuable +insights into autonomous forest reconstruction using drones, and several +further development topics were proposed. + +
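+ The DBH accuracy figures quoted above are the usual RMSE and bias of the estimates against field-measured reference diameters, computed as in the short sketch below (a generic definition, not the authors' evaluation script).
+
+import numpy as np
+
+def dbh_errors(estimated_cm, reference_cm):
+    """RMSE and bias of diameter-at-breast-height estimates, in centimetres."""
+    err = np.asarray(estimated_cm) - np.asarray(reference_cm)
+    rmse = float(np.sqrt(np.mean(err ** 2)))
+    bias = float(np.mean(err))
+    return rmse, bias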
+
+ comment: 35 pages, 13 Figures +
+
+
+
+
+ + ☆ Low-Cost 3D printed, Biocompatible Ionic Polymer Membranes for Soft + Actuators + + +
+ Ionic polymer actuators, in essence, consist of ion exchange polymers
+sandwiched between layers of electrodes. They have recently gained recognition
+as promising candidates for soft actuators due to their lightweight nature,
+noise-free operation, and low driving voltages. However, the materials
+traditionally utilized to develop them are often not human/environmentally
+friendly. Thus, to address this issue, researchers have been focusing on
+developing biocompatible versions of this actuator. Despite this, such
+actuators still face challenges in achieving high performance in terms of
+payload capacity, bending capability, and response time. In this paper, we
+present a biocompatible ionic polymer actuator whose membrane is fully 3D
+printed utilizing a direct ink writing method. The structure of the printed
+membranes consists of biodegradable ionic fluid encapsulated within layers of
+activated carbon polymers. From the microscopic observations of its structure,
+we confirmed that the ionic polymer is well encapsulated. The actuators can
+achieve a bending performance of up to 124$^\circ$ (curvature of 0.82
+$\text{cm}^{-1}$), which, to our knowledge, is the highest curvature attained
+by any bending ionic polymer actuator to date. It can operate comfortably up to
+a 2 Hz driving frequency and can achieve blocked forces of up to 0.76 mN. Our
+results showcase a promising, high-performing biocompatible ionic polymer
+actuator, whose membrane can be easily manufactured in a single step using a
+standard FDM 3D printer. This approach paves the way for creating customized
+designs for functional soft robotic applications, including human-interactive
+devices, in the near future.
+
+
+ comment: 6 pages, 8 figures, Accepted in IEEE International Conference on Soft + Robotics 2025 (Robosoft) +
+
+
+
+
+ + ☆ Learning to Hop for a Single-Legged Robot with Parallel Mechanism + + +
+ This work presents the application of reinforcement learning to improve the
+performance of a highly dynamic hopping system with a parallel mechanism.
+Unlike serial mechanisms, parallel mechanisms cannot be accurately simulated
+due to the complexity of their kinematic constraints and closed-loop
+structures. In addition, learning to hop suffers from a prolonged aerial phase
+and the sparse nature of the rewards. To address these issues, we propose a
+learning framework that encodes long-history feedback to account for the
+under-actuation brought by the prolonged aerial phase. In the proposed
+framework, we also introduce a simplified serial configuration for the parallel
+design to avoid directly simulating the parallel structure during training. A
+torque-level mapping is designed to handle the parallel-serial conversion and
+the associated sim-to-real issue. Simulation and hardware experiments have been
+conducted to validate this framework.
+
+
+
+
+
+ + ☆ Navigating Robot Swarm Through a Virtual Tube with Flow-Adaptive + Distribution Control + + +
+ With the rapid development of robot swarm technology and its diverse
+applications, navigating robot swarms through complex environments has emerged
+as a critical research direction. To ensure safe navigation and avoid potential
+collisions with obstacles, the concept of virtual tubes has been introduced to
+define safe and navigable regions. However, current control methods in virtual
+tubes face congestion issues, particularly in narrow virtual tubes with low
+throughput. To address these challenges, we first introduce the concepts of
+virtual tube area and flow capacity, and develop a new evolution model for the
+spatial density function. Next, we propose a novel control method that combines
+a modified artificial potential field (APF) for swarm navigation and density
+feedback control for distribution regulation, under which a saturated velocity
+command is designed. Then, we generate a global velocity field that not only
+ensures collision-free navigation through the virtual tube, but also achieves
+local input-to-state stability (LISS) for density tracking errors, both of
+which are rigorously proven. Finally, numerical simulations and realistic
+applications validate the effectiveness and advantages of the proposed method
+in managing robot swarms within narrow virtual tubes.
+
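+ The saturated velocity command described above has the general shape sketched below: an APF navigation term and a density-feedback term are summed and then clipped to a maximum speed. The specific feedback law and gains come from the paper and are not reproduced here.
+
+import numpy as np
+
+def velocity_command(v_apf, v_density, v_max):
+    """Combine an APF navigation term with a density-feedback term and saturate
+    the result to the maximum allowed speed v_max."""
+    v = v_apf + v_density
+    speed = np.linalg.norm(v)
+    return v if speed <= v_max else v * (v_max / speed)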
+
+
+
+
+ + ☆ Nocturnal eye inspired liquid to gas phase change soft actuator with + Laser-Induced-Graphene: enhanced environmental light harvesting and + photothermal conversion + + +
+ Robotic systems' mobility is constrained by power sources and wiring. While +pneumatic actuators remain tethered to air supplies, we developed a new +actuator utilizing light energy. Inspired by nocturnal animals' eyes, we +designed a bilayer soft actuator incorporating Laser-Induced Graphene (LIG) on +the inner surface of a silicone layer. This design maintains silicone's +transparency and flexibility while achieving 54% faster response time compared +to conventional actuators through enhanced photothermal conversion. + +
+
+ comment: 23pages, 8 figures, journal paper +
+
+
+
+
+ + ☆ DynoSAM: Open-Source Smoothing and Mapping Framework for Dynamic SLAM + + +
+ Traditional Visual Simultaneous Localization and Mapping (vSLAM) systems
+focus solely on static scene structures, overlooking dynamic elements in the
+environment. Although effective for accurate visual odometry in complex
+scenarios, these methods discard crucial information about moving objects. By
+incorporating this information into a Dynamic SLAM framework, the motion of
+dynamic entities can be estimated, enhancing navigation whilst ensuring
+accurate localization. However, the fundamental formulation of Dynamic SLAM
+remains an open challenge, with no consensus on the optimal approach for
+accurate motion estimation within a SLAM pipeline. Therefore, we developed
+DynoSAM, an open-source framework for Dynamic SLAM that enables the efficient
+implementation, testing, and comparison of various Dynamic SLAM optimization
+formulations. DynoSAM integrates static and dynamic measurements into a unified
+optimization problem solved using factor graphs, simultaneously estimating
+camera poses, the static scene, object motions or poses, and object structures.
+We evaluate DynoSAM across diverse simulated and real-world datasets, achieving
+state-of-the-art motion estimation in indoor and outdoor environments, with
+substantial improvements over existing systems. Additionally, we demonstrate
+DynoSAM's utility in downstream applications, including 3D reconstruction of
+dynamic scenes and trajectory prediction, thereby showcasing its potential for
+advancing dynamic object-aware SLAM systems. DynoSAM is open-sourced at
+https://github.com/ACFR-RPG/DynOSAM.
+
+
+ comment: 20 pages, 10 figures. Submitted to T-RO Visual SLAM SI 2025 +
+
+
+
+
+ + ☆ Connection-Coordination Rapport (CCR) Scale: A Dual-Factor Scale to + Measure Human-Robot Rapport + + +
+ Robots, particularly in service and companionship roles, must develop +positive relationships with people they interact with regularly to be +successful. These positive human-robot relationships can be characterized as +establishing "rapport," which indicates mutual understanding and interpersonal +connection that form the groundwork for successful long-term human-robot +interaction. However, the human-robot interaction research literature lacks +scale instruments to assess human-robot rapport in a variety of situations. In +this work, we developed the 18-item Connection-Coordination Rapport (CCR) Scale +to measure human-robot rapport. We first ran Study 1 (N = 288) where online +participants rated videos of human-robot interactions using a set of candidate +items. Our Study 1 results showed the discovery of two factors in our scale, +which we named "Connection" and "Coordination." We then evaluated this scale by +running Study 2 (N = 201) where online participants rated a new set of +human-robot interaction videos with our scale and an existing rapport scale +from virtual agents research for comparison. We also validated our scale by +replicating a prior in-person human-robot interaction study, Study 3 (N = 44), +and found that rapport is rated significantly greater when participants +interacted with a responsive robot (responsive condition) as opposed to an +unresponsive robot (unresponsive condition). Results from these studies +demonstrate high reliability and validity for the CCR scale, which can be used +to measure rapport in both first-person and third-person perspectives. We +encourage the adoption of this scale in future studies to measure rapport in a +variety of human-robot interactions. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Automating High Quality RT Planning at Scale + + +
+ Radiotherapy (RT) planning is complex, subjective, and time-intensive.
+Advances in artificial intelligence (AI) promise to improve its precision,
+efficiency, and consistency, but progress is often limited by the scarcity of
+large, standardized datasets. To address this, we introduce the Automated
+Iterative RT Planning (AIRTP) system, a scalable solution for generating
+high-quality treatment plans. The system is designed to generate substantial
+volumes of consistently high-quality treatment plans, overcoming a key obstacle
+in the advancement of AI-driven RT planning. Our AIRTP pipeline adheres to
+clinical guidelines and automates essential steps, including organ-at-risk
+(OAR) contouring, helper structure creation, beam setup, optimization, and plan
+quality improvement, using AI integrated with RT planning software such as
+Varian's Eclipse. Furthermore, we introduce a novel approach for determining
+optimization parameters to reproduce 3D dose distributions, i.e., a method to
+convert dose predictions into deliverable treatment plans constrained by
+machine limitations. A comparative analysis of plan quality reveals that our
+automated pipeline produces treatment plans of quality comparable to those
+generated manually, which traditionally require several hours of labor per
+plan. Committed to public research, the first data release of our AIRTP
+pipeline includes nine cohorts covering head-and-neck and lung cancer sites to
+support an AAPM 2025 challenge. To the best of our knowledge, this data set
+features more than 10 times the number of plans of the largest existing
+well-curated public data set.
+Repo: https://github.com/RiqiangGao/GDP-HMM_AAPMChallenge
+
+
+
+ comment: Related to GDP-HMM grand challenge +
+
+
+
+
+ + ☆ Interaction Dataset of Autonomous Vehicles with Traffic Lights and Signs + + +
+ This paper presents the development of a comprehensive dataset capturing +interactions between Autonomous Vehicles (AVs) and traffic control devices, +specifically traffic lights and stop signs. Derived from the Waymo Motion +dataset, our work addresses a critical gap in the existing literature by +providing real-world trajectory data on how AVs navigate these traffic control +devices. We propose a methodology for identifying and extracting relevant +interaction trajectory data from the Waymo Motion dataset, incorporating over +37,000 instances with traffic lights and 44,000 with stop signs. Our +methodology includes defining rules to identify various interaction types, +extracting trajectory data, and applying a wavelet-based denoising method to +smooth the acceleration and speed profiles and eliminate anomalous values, +thereby enhancing the trajectory quality. Quality assessment metrics indicate +that trajectories obtained in this study have anomaly proportions in +acceleration and jerk profiles reduced to near-zero levels across all +interaction categories. By making this dataset publicly available, we aim to +address the current gap in datasets containing AV interaction behaviors with +traffic lights and signs. Based on the organized and published dataset, we can +gain a more in-depth understanding of AVs' behavior when interacting with +traffic lights and signs. This will facilitate research on AV integration into +existing transportation infrastructures and networks, supporting the +development of more accurate behavioral models and simulation tools. + +
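The wavelet-based denoising step described above can be illustrated with a small sketch using PyWavelets. The wavelet family, decomposition level, and universal soft-threshold rule are assumptions; they are not necessarily the settings used for this dataset.

```python
import numpy as np
import pywt  # PyWavelets

def wavelet_denoise(signal, wavelet="db4", level=4):
    """Soft-threshold wavelet denoising of a 1D speed/acceleration profile.
    The wavelet, level, and universal-threshold rule are illustrative choices."""
    coeffs = pywt.wavedec(signal, wavelet, level=level)
    # Estimate the noise level from the finest detail coefficients.
    sigma = np.median(np.abs(coeffs[-1])) / 0.6745
    thresh = sigma * np.sqrt(2.0 * np.log(len(signal)))
    denoised = [coeffs[0]] + [pywt.threshold(c, thresh, mode="soft")
                              for c in coeffs[1:]]
    return pywt.waverec(denoised, wavelet)[: len(signal)]

t = np.linspace(0, 10, 500)
accel = np.sin(t) + 0.3 * np.random.randn(t.size)   # noisy acceleration profile
smooth = wavelet_denoise(accel)
```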
+
+
+
+
+ + ☆ ELEGNT: Expressive and Functional Movement Design for + Non-anthropomorphic Robot + + +
+ Nonverbal behaviors such as posture, gestures, and gaze are essential for +conveying internal states, both consciously and unconsciously, in human +interaction. For robots to interact more naturally with humans, robot movement +design should likewise integrate expressive qualities, such as intention, +attention, and emotions, alongside traditional functional considerations like +task fulfillment and time efficiency. In this paper, we present the design and +prototyping of a lamp-like robot that explores the interplay between functional +and expressive objectives in movement design. Using a research-through-design +methodology, we document the hardware design process, define expressive +movement primitives, and outline a set of interaction scenario storyboards. We +propose a framework that incorporates both functional and expressive utilities +during movement generation, and implement the robot behavior sequences in +different function- and social- oriented tasks. Through a user study comparing +expression-driven versus function-driven movements across six task scenarios, +our findings indicate that expression-driven movements significantly enhance +user engagement and perceived robot qualities. This effect is especially +pronounced in social-oriented tasks. + +
+
+ comment: 13 pages, manuscript under review +
+
+
+
+
+ + ☆ TOFFE -- Temporally-binned Object Flow from Events for High-speed and + Energy-Efficient Object Detection and Tracking + + +
+ Object detection and tracking is an essential perception task for enabling
+fully autonomous navigation in robotic systems. Edge robot systems such as
+small drones need to execute complex maneuvers at high speeds with limited
+resources, which places strict constraints on the underlying algorithms and
+hardware. Traditionally, frame-based cameras are used for vision-based
+perception due to their rich spatial information and simplified synchronous
+sensing capabilities. However, obtaining detailed information across frames
+incurs high energy consumption and may not even be required. In addition, their
+low temporal resolution renders them ineffective in high-speed motion
+scenarios. Event-based cameras offer a biologically-inspired solution to this
+by capturing only changes in intensity levels at exceptionally high temporal
+resolution and low power consumption, making them ideal for high-speed motion
+scenarios. However, their asynchronous and sparse outputs are not natively
+compatible with conventional deep learning methods. In this work, we propose
+TOFFE, a lightweight hybrid framework for performing event-based object motion
+estimation (including pose, direction, and speed estimation), referred to as
+Object Flow. TOFFE integrates bio-inspired Spiking Neural Networks (SNNs) and
+conventional Analog Neural Networks (ANNs) to efficiently process events at
+high temporal resolutions while being simple to train. Additionally, we present
+a novel event-based synthetic dataset involving high-speed object motion to
+train TOFFE. Our experimental results show that TOFFE achieves 5.7x/8.3x
+reduction in energy consumption and 4.6x/5.8x reduction in latency on edge
+GPU (Jetson TX2)/hybrid hardware (Loihi-2 and Jetson TX2), compared to previous
+event-based object detection baselines.
+
+
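The "temporally-binned" idea, converting asynchronous events into a dense tensor that conventional layers can consume, can be sketched as follows. The voxel-grid representation below is a common event-camera preprocessing pattern and is only assumed to approximate TOFFE's input format.

```python
import numpy as np

def bin_events(events, num_bins, height, width):
    """Accumulate asynchronous events (t, x, y, polarity) into a dense
    (num_bins, H, W) tensor so they can be fed to conventional layers.
    Illustrative representation; the exact one used by TOFFE may differ."""
    t, x, y, p = (events[:, 0], events[:, 1].astype(int),
                  events[:, 2].astype(int), events[:, 3])
    # Normalize timestamps into [0, 1) and pick a temporal bin per event.
    t_norm = (t - t.min()) / max(t.max() - t.min(), 1e-9)
    b = np.clip((t_norm * num_bins).astype(int), 0, num_bins - 1)
    grid = np.zeros((num_bins, height, width), dtype=np.float32)
    np.add.at(grid, (b, y, x), p)          # signed accumulation of polarities
    return grid

events = np.array([[0.00, 10, 12, +1],
                   [0.01, 11, 12, -1],
                   [0.03, 40, 25, +1]])
voxels = bin_events(events, num_bins=4, height=64, width=64)
```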
+
+ comment: 8 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ LiCAR: pseudo-RGB LiDAR image for CAR segmentation + + +
+ With the advancement of computing resources, an increasing number of Neural
+Networks (NNs) for image detection and segmentation are appearing. However,
+these methods usually accept an RGB 2D image as input. On the other hand, Light
+Detection And Ranging (LiDAR) sensors with many layers provide images that are
+similar to those obtained from a traditional low-resolution RGB camera.
+Following this principle, a new dataset for segmenting cars in pseudo-RGB
+images has been generated. This dataset combines the information given by the
+LiDAR sensor into a Spherical Range Image (SRI), concretely the reflectivity,
+near-infrared and signal intensity 2D images. These images are then fed into
+instance segmentation NNs. These NNs segment the cars that appear in the
+images, achieving a Bounding Box (BB) and mask precision of 88% and 81.5%,
+respectively, with You Only Look Once (YOLO)-v8 large. Using this segmentation
+NN, several trackers have been applied to follow each segmented car instance
+along a video feed, showing great performance in real-world experiments.
+
+
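A minimal sketch of building a pseudo-RGB Spherical Range Image from a LiDAR point cloud is shown below. The vertical field of view, image resolution, and channel ordering are assumptions, not the sensor configuration used for the LiCAR dataset.

```python
import numpy as np

def spherical_range_image(points, reflectivity, nir, signal,
                          h=64, w=1024, fov_up=15.0, fov_down=-15.0):
    """Project LiDAR points into a pseudo-RGB Spherical Range Image whose
    three channels hold reflectivity, near-infrared, and signal intensity.
    Field of view and image size are illustrative values."""
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    r = np.linalg.norm(points, axis=1) + 1e-9
    yaw = np.arctan2(y, x)                      # azimuth in [-pi, pi]
    pitch = np.arcsin(z / r)                    # elevation
    fov_up_r, fov_down_r = np.radians(fov_up), np.radians(fov_down)
    u = ((1.0 - (yaw + np.pi) / (2 * np.pi)) * w).astype(int) % w
    v = (fov_up_r - pitch) / (fov_up_r - fov_down_r) * h
    v = np.clip(v.astype(int), 0, h - 1)
    img = np.zeros((h, w, 3), dtype=np.float32)
    img[v, u, 0] = reflectivity
    img[v, u, 1] = nir
    img[v, u, 2] = signal
    return img

pts = np.random.randn(1000, 3) * np.array([10.0, 10.0, 1.0])
sri = spherical_range_image(pts, np.random.rand(1000),
                            np.random.rand(1000), np.random.rand(1000))
```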
+
+ comment: This is a preprint version of the work accepted at 5th International + Conference on Robotics, Computer Vision and Intelligent Systems (ROBOVIS + 2025) +
+
+
+
+
+ + ☆ A causal learning approach to in-orbit inertial parameter estimation for + multi-payload deployers + + +
+ This paper discusses an approach to inertial parameter estimation for the +case of cargo carrying spacecraft that is based on causal learning, i.e. +learning from the responses of the spacecraft, under actuation. Different +spacecraft configurations (inertial parameter sets) are simulated under +different actuation profiles, in order to produce an optimised time-series +clustering classifier that can be used to distinguish between them. The +actuation is comprised of finite sequences of constant inputs that are applied +in order, based on typical actuators available. By learning from the system's +responses across multiple input sequences, and then applying measures of +time-series similarity and F1-score, an optimal actuation sequence can be +chosen either for one specific system configuration or for the overall set of +possible configurations. This allows for both estimation of the inertial +parameter set without any prior knowledge of state, as well as validation of +transitions between different configurations after a deployment event. The +optimisation of the actuation sequence is handled by a reinforcement learning +model that uses the proximal policy optimisation (PPO) algorithm, by repeatedly +trying different sequences and evaluating the impact on classifier performance +according to a multi-objective metric. + +
+
+ comment: 10 pages, 18 figures, 1 table. Presented in 75th International + Astronautical Congress (IAC), Milan, Italy, 14-18 October 2024 +
+
+
+
+
+ + ♻ ☆ FoundationStereo: Zero-Shot Stereo Matching + + +
+ Tremendous progress has been made in deep stereo matching to excel on +benchmark datasets through per-domain fine-tuning. However, achieving strong +zero-shot generalization - a hallmark of foundation models in other computer +vision tasks - remains challenging for stereo matching. We introduce +FoundationStereo, a foundation model for stereo depth estimation designed to +achieve strong zero-shot generalization. To this end, we first construct a +large-scale (1M stereo pairs) synthetic training dataset featuring large +diversity and high photorealism, followed by an automatic self-curation +pipeline to remove ambiguous samples. We then design a number of network +architecture components to enhance scalability, including a side-tuning feature +backbone that adapts rich monocular priors from vision foundation models to +mitigate the sim-to-real gap, and long-range context reasoning for effective +cost volume filtering. Together, these components lead to strong robustness and +accuracy across domains, establishing a new standard in zero-shot stereo depth +estimation. Project page: https://nvlabs.github.io/FoundationStereo/ + +
+
+
+
+
+ + ♻ ☆ A Search-to-Control Reinforcement Learning Based Framework for Quadrotor + Local Planning in Dense Environments + + +
+ Agile flight in complex environments poses significant challenges to current +motion planning methods, as they often fail to fully leverage the quadrotor's +dynamic potential, leading to performance failures and reduced efficiency +during aggressive maneuvers. Existing approaches frequently decouple trajectory +optimization from control generation and neglect the dynamics, further limiting +their ability to generate aggressive and feasible motions. To address these +challenges, we introduce an enhanced Search-to-Control planning framework that +integrates visibility path searching with reinforcement learning (RL) control +generation, directly accounting for dynamics and bridging the gap between +planning and control. Our method first extracts control points from +collision-free paths using a proposed heuristic search, which are then refined +by an RL policy to generate low-level control commands for the quadrotor's +controller, utilizing reduced-dimensional obstacle observations for efficient +inference with lightweight neural networks. We validate the framework through +simulations and real-world experiments, demonstrating improved time efficiency +and dynamic maneuverability compared to existing methods, while confirming its +robustness and applicability. To support further research, We will release our +implementation as an open-source package. + +
+
+
+
+
+ + ♻ ☆ RadaRays: Real-time Simulation of Rotating FMCW Radar for Mobile + Robotics via Hardware-accelerated Ray Tracing + + +
+ RadaRays allows for the accurate modeling and simulation of rotating FMCW +radar sensors in complex environments, including the simulation of reflection, +refraction, and scattering of radar waves. Our software is able to handle large +numbers of objects and materials in real-time, making it suitable for use in a +variety of mobile robotics applications. We demonstrate the effectiveness of +RadaRays through a series of experiments and show that it can more accurately +reproduce the behavior of FMCW radar sensors in a variety of environments, +compared to the ray casting-based lidar-like simulations that are commonly used +in simulators for autonomous driving such as CARLA. Our experiments +additionally serve as a valuable reference point for researchers to evaluate +their own radar simulations. By using RadaRays, developers can significantly +reduce the time and cost associated with prototyping and testing FMCW +radar-based algorithms. We also provide a Gazebo plugin that makes our work +accessible to the mobile robotics community. + +
+
+
+
+
+ + ♻ ☆ Sampling-based Model Predictive Control Leveraging Parallelizable + Physics Simulations RA-L + + +
+ We present a method for sampling-based model predictive control that makes +use of a generic physics simulator as the dynamical model. In particular, we +propose a Model Predictive Path Integral controller (MPPI), that uses the +GPU-parallelizable IsaacGym simulator to compute the forward dynamics of a +problem. By doing so, we eliminate the need for explicit encoding of robot +dynamics and contacts with objects for MPPI. Since no explicit dynamic modeling +is required, our method is easily extendable to different objects and robots +and allows one to solve complex navigation and contact-rich tasks. We +demonstrate the effectiveness of this method in several simulated and +real-world settings, among which mobile navigation with collision avoidance, +non-prehensile manipulation, and whole-body control for high-dimensional +configuration spaces. This method is a powerful and accessible open-source tool +to solve a large variety of contact-rich motion planning tasks. + +
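The core MPPI update, which samples perturbed control sequences, rolls them out, and re-weights them by a softmax over trajectory costs, can be sketched in a few lines. In the paper the rollouts are computed in parallel by IsaacGym; the toy dynamics and cost below are placeholders.

```python
import numpy as np

def mppi_step(u_nominal, dynamics, cost, x0, num_samples=256,
              noise_sigma=0.3, temperature=1.0):
    """One Model Predictive Path Integral update over a nominal control sequence.
    `dynamics` and `cost` stand in for the simulator-backed rollouts used in
    the paper; this is a generic MPPI sketch, not their implementation."""
    horizon, udim = u_nominal.shape
    noise = np.random.randn(num_samples, horizon, udim) * noise_sigma
    costs = np.zeros(num_samples)
    for k in range(num_samples):
        x = x0.copy()
        for t in range(horizon):
            x = dynamics(x, u_nominal[t] + noise[k, t])
            costs[k] += cost(x)
    beta = costs.min()
    weights = np.exp(-(costs - beta) / temperature)
    weights /= weights.sum()
    return u_nominal + np.einsum("k,khu->hu", weights, noise)

# Toy double-integrator example.
dyn = lambda x, u: x + 0.1 * np.array([x[1], u[0]])
cst = lambda x: x[0] ** 2 + 0.1 * x[1] ** 2
u = np.zeros((20, 1))
u = mppi_step(u, dyn, cst, x0=np.array([1.0, 0.0]))
```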
+
+ comment: Accepted for RA-L. Code and videos available at + https://autonomousrobots.nl/paper_websites/isaac-mppi +
+
+
+
+
+ + ♻ ☆ Concurrent-Learning Based Relative Localization in Shape Formation of + Robot Swarms (Extended version) + + +
+ In this paper, we address the shape formation problem for massive robot +swarms in environments where external localization systems are unavailable. +Achieving this task effectively with solely onboard measurements is still +scarcely explored and faces some practical challenges. To solve this +challenging problem, we propose the following novel results. Firstly, to +estimate the relative positions among neighboring robots, a concurrent-learning +based estimator is proposed. It relaxes the persistent excitation condition +required in the classical ones such as least-square estimator. Secondly, we +introduce a finite-time agreement protocol to determine the shape location. +This is achieved by estimating the relative position between each robot and a +randomly assigned seed robot. The initial position of the seed one marks the +shape location. Thirdly, based on the theoretical results of the relative +localization, a novel behavior-based control strategy is devised. This strategy +not only enables adaptive shape formation of large group of robots but also +enhances the observability of inter-robot relative localization. Numerical +simulation results are provided to verify the performance of our proposed +strategy compared to the state-of-the-art ones. Additionally, outdoor +experiments on real robots further demonstrate the practical effectiveness and +robustness of our methods. + +
+
+
+
+
+ + ♻ ☆ Multi-Agent Consensus Seeking via Large Language Models + + +
+ Multi-agent systems driven by large language models (LLMs) have shown +promising abilities for solving complex tasks in a collaborative manner. This +work considers a fundamental problem in multi-agent collaboration: consensus +seeking. When multiple agents work together, we are interested in how they can +reach a consensus through inter-agent negotiation. To that end, this work +studies a consensus-seeking task where the state of each agent is a numerical +value and they negotiate with each other to reach a consensus value. It is +revealed that when not explicitly directed on which strategy should be adopted, +the LLM-driven agents primarily use the average strategy for consensus seeking +although they may occasionally use some other strategies. Moreover, this work +analyzes the impact of the agent number, agent personality, and network +topology on the negotiation process. The findings reported in this work can +potentially lay the foundations for understanding the behaviors of LLM-driven +multi-agent systems for solving more complex tasks. Furthermore, LLM-driven +consensus seeking is applied to a multi-robot aggregation task. This +application demonstrates the potential of LLM-driven agents to achieve +zero-shot autonomous planning for multi-robot collaboration tasks. Project +website: windylab.github.io/ConsensusLLM/. + +
+
+
+
+
+ + ♻ ☆ AirPilot: Interpretable PPO-based DRL Auto-Tuned Nonlinear PID Drone + Controller for Robust Autonomous Flights + + +
+ Navigation precision, speed and stability are crucial for safe Unmanned +Aerial Vehicle (UAV) flight maneuvers and effective flight mission executions +in dynamic environments. Different flight missions may have varying objectives, +such as minimizing energy consumption, achieving precise positioning, or +maximizing speed. A controller that can adapt to different objectives on the +fly is highly valuable. Proportional Integral Derivative (PID) controllers are +one of the most popular and widely used control algorithms for drones and other +control systems, but their linear control algorithm fails to capture the +nonlinear nature of the dynamic wind conditions and complex drone system. +Manually tuning the PID gains for various missions can be time-consuming and +requires significant expertise. This paper aims to revolutionize drone flight +control by presenting the AirPilot, a nonlinear Deep Reinforcement Learning +(DRL) - enhanced Proportional Integral Derivative (PID) drone controller using +Proximal Policy Optimization (PPO). AirPilot controller combines the simplicity +and effectiveness of traditional PID control with the adaptability, learning +capability, and optimization potential of DRL. This makes it better suited for +modern drone applications where the environment is dynamic, and +mission-specific performance demands are high. We employed a COEX Clover +autonomous drone for training the DRL agent within the simulator and +implemented it in a real-world lab setting, which marks a significant milestone +as one of the first attempts to apply a DRL-based flight controller on an +actual drone. Airpilot is capable of reducing the navigation error of the +default PX4 PID position controller by 90%, improving effective navigation +speed of a fine-tuned PID controller by 21%, reducing settling time and +overshoot by 17% and 16% respectively. + +
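A minimal sketch of the underlying idea, a PID loop whose gains are supplied each step by a learned policy, is given below. The policy stub, gains, and first-order plant are illustrative; they are not the AirPilot controller or its PPO network.

```python
import numpy as np

class AdaptivePID:
    """PID loop whose gains are supplied each step by an external policy.
    In AirPilot the gains would come from a PPO-trained network observing the
    flight state; here the policy is only a stub."""
    def __init__(self, dt=0.02):
        self.dt = dt
        self.integral = 0.0
        self.prev_error = 0.0

    def step(self, error, gains):
        kp, ki, kd = gains
        self.integral += error * self.dt
        derivative = (error - self.prev_error) / self.dt
        self.prev_error = error
        return kp * error + ki * self.integral + kd * derivative

def gain_policy(observation):
    # Placeholder for a learned gain-scheduling policy (assumed interface).
    return np.array([1.2, 0.05, 0.3])

pid = AdaptivePID()
setpoint, position = 1.0, 0.0
for _ in range(100):
    err = setpoint - position
    u = pid.step(err, gain_policy(np.array([err, position])))
    position += 0.02 * u          # crude first-order plant, for illustration only
```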
+
+ comment: 9 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Complementarity-Free Multi-Contact Modeling and Optimization for + Dexterous Manipulation + + +
+ A significant barrier preventing model-based methods from achieving real-time +and versatile dexterous robotic manipulation is the inherent complexity of +multi-contact dynamics. Traditionally formulated as complementarity models, +multi-contact dynamics introduces non-smoothness and combinatorial complexity, +complicating contact-rich planning and optimization. In this paper, we +circumvent these challenges by introducing a lightweight yet capable +multi-contact model. Our new model, derived from the duality of +optimization-based contact models, dispenses with the complementarity +constructs entirely, providing computational advantages such as closed-form +time stepping, differentiability, automatic satisfaction with Coulomb friction +law, and minimal hyperparameter tuning. We demonstrate the effectiveness and +efficiency of the model for planning and control in a range of challenging +dexterous manipulation tasks, including fingertip 3D in-air manipulation, +TriFinger in-hand manipulation, and Allegro hand on-palm reorientation, all +performed with diverse objects. Our method consistently achieves +state-of-the-art results: (I) a 96.5% average success rate across all objects +and tasks, (II) high manipulation accuracy with an average reorientation error +of 11{\deg} and position error of 7.8mm, and (III) contact-implicit model +predictive control running at 50-100 Hz for all objects and tasks. These +results are achieved with minimal hyperparameter tuning. + +
+
+ comment: Video demo: https://youtu.be/NsL4hbSXvFg +
+
+
+
+
+ + ♻ ☆ Optimal Spatial-Temporal Triangulation for Bearing-Only Cooperative + Motion Estimation + + +
+ Vision-based cooperative motion estimation is an important problem for many +multi-robot systems such as cooperative aerial target pursuit. This problem can +be formulated as bearing-only cooperative motion estimation, where the visual +measurement is modeled as a bearing vector pointing from the camera to the +target. The conventional approaches for bearing-only cooperative estimation are +mainly based on the framework distributed Kalman filtering (DKF). In this +paper, we propose a new optimal bearing-only cooperative estimation algorithm, +named spatial-temporal triangulation, based on the method of distributed +recursive least squares, which provides a more flexible framework for designing +distributed estimators than DKF. The design of the algorithm fully incorporates +all the available information and the specific triangulation geometric +constraint. As a result, the algorithm has superior estimation performance than +the state-of-the-art DKF algorithms in terms of both accuracy and convergence +speed as verified by numerical simulation. We rigorously prove the exponential +convergence of the proposed algorithm. Moreover, to verify the effectiveness of +the proposed algorithm under practical challenging conditions, we develop a +vision-based cooperative aerial target pursuit system, which is the first of +such fully autonomous systems so far to the best of our knowledge. + +
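The geometric core of bearing-only estimation, triangulating a target from bearing vectors taken at known camera positions, can be sketched with a batch least-squares solve. This is only the static triangulation constraint; the paper's contribution is the distributed recursive (spatial-temporal) estimator built on top of it.

```python
import numpy as np

def triangulate_bearings(cam_positions, bearings):
    """Least-squares target position from bearing-only measurements.
    Each unit bearing g_i observed at position p_i constrains the target x by
    (I - g_i g_i^T)(x - p_i) = 0; stacking these gives a linear system."""
    A = np.zeros((3, 3))
    b = np.zeros(3)
    for p, g in zip(cam_positions, bearings):
        g = g / np.linalg.norm(g)
        P = np.eye(3) - np.outer(g, g)      # projector orthogonal to the bearing
        A += P
        b += P @ p
    return np.linalg.solve(A, b)

target = np.array([2.0, 1.0, 3.0])
cams = [np.array([0.0, 0.0, 0.0]), np.array([4.0, 0.0, 0.0]),
        np.array([0.0, 5.0, 1.0])]
meas = [target - p for p in cams]           # noise-free bearings for the demo
print(triangulate_bearings(cams, meas))     # approximately [2, 1, 3]
```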
+
+
+
+
+ + ♻ ☆ Tightly-Coupled LiDAR-IMU-Wheel Odometry with an Online Neural Kinematic + Model Learning via Factor Graph Optimization + + +
+ Environments lacking geometric features (e.g., tunnels and long straight
+corridors) are challenging for LiDAR-based odometry algorithms because LiDAR
+point clouds degenerate in such environments. For wheeled robots, a wheel
+kinematic model (i.e., wheel odometry) can improve the reliability of the
+odometry estimation. However, the kinematic model suffers from complex motions
+(e.g., wheel slippage, lateral movement) in the case of skid-steering robots
+particularly because this robot model rotates by skidding its wheels.
+Furthermore, these errors change nonlinearly when the wheel slippage is large
+(e.g., drifting) and are subject to terrain-dependent parameters. To
+simultaneously tackle point cloud degeneration and the kinematic model errors,
+we developed a LiDAR-IMU-wheel odometry algorithm incorporating online training
+of a neural network that learns the kinematic model of wheeled robots with
+nonlinearity. We propose to train the neural network online on a factor graph
+along with robot states, allowing the learning-based kinematic model to adapt
+to the current terrain condition. The proposed method jointly solves online
+training of the neural network and LiDAR-IMU-wheel odometry on a unified factor
+graph to retain the consistency of all those constraints. Through experiments,
+we first verified that the proposed network adapted to a changing environment,
+resulting in an accurate odometry estimation across different environments. We
+then confirmed that the proposed odometry estimation algorithm was robust
+against point cloud degeneration and nonlinearity (e.g., large wheel slippage
+by drifting) of the kinematic model.
+
+
+
+ comment: https://youtu.be/CvRVhdda7Cw +
+
+
+
+
+ + ♻ ☆ FLAME: Learning to Navigate with Multimodal LLM in Urban Environments + + +
+ Large Language Models (LLMs) have demonstrated potential in +Vision-and-Language Navigation (VLN) tasks, yet current applications face +challenges. While LLMs excel in general conversation scenarios, they struggle +with specialized navigation tasks, yielding suboptimal performance compared to +specialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied +Agent), a novel Multimodal LLM-based agent and architecture designed for urban +VLN tasks that efficiently handles multiple observations. Our approach +implements a three-phase tuning technique for effective adaptation to +navigation tasks, including single perception tuning for street view +description, multiple perception tuning for route summarization, and end-to-end +training on VLN datasets. The augmented datasets are synthesized automatically. +Experimental results demonstrate FLAME's superiority over existing methods, +surpassing state-of-the-art methods by a 7.3% increase in task completion on +Touchdown dataset. This work showcases the potential of Multimodal LLMs (MLLMs) +in complex navigation tasks, representing an advancement towards applications +of MLLMs in the field of embodied intelligence. + +
+
+ comment: Accepted to AAAI 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ DCPI-Depth: Explicitly Infusing Dense Correspondence Prior to + Unsupervised Monocular Depth Estimation + + +
+ There has been a recent surge of interest in learning to perceive depth from +monocular videos in an unsupervised fashion. A key challenge in this field is +achieving robust and accurate depth estimation in challenging scenarios, +particularly in regions with weak textures or where dynamic objects are +present. This study makes three major contributions by delving deeply into +dense correspondence priors to provide existing frameworks with explicit +geometric constraints. The first novelty is a contextual-geometric depth +consistency loss, which employs depth maps triangulated from dense +correspondences based on estimated ego-motion to guide the learning of depth +perception from contextual information, since explicitly triangulated depth +maps capture accurate relative distances among pixels. The second novelty +arises from the observation that there exists an explicit, deducible +relationship between optical flow divergence and depth gradient. A differential +property correlation loss is, therefore, designed to refine depth estimation +with a specific emphasis on local variations. The third novelty is a +bidirectional stream co-adjustment strategy that enhances the interaction +between rigid and optical flows, encouraging the former towards more accurate +correspondence and making the latter more adaptable across various scenarios +under the static scene hypotheses. DCPI-Depth, a framework that incorporates +all these innovative components and couples two bidirectional and collaborative +streams, achieves state-of-the-art performance and generalizability across +multiple public datasets, outperforming all existing prior arts. Specifically, +it demonstrates accurate depth estimation in texture-less and dynamic regions, +and shows more reasonable smoothness. Our source code will be publicly +available at mias.group/DCPI-Depth upon publication. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Embodied-RAG: General Non-parametric Embodied Memory for Retrieval and + Generation + + +
+ There is no limit to how much a robot might explore and learn, but all of +that knowledge needs to be searchable and actionable. Within language research, +retrieval augmented generation (RAG) has become the workhorse of large-scale +non-parametric knowledge; however, existing techniques do not directly transfer +to the embodied domain, which is multimodal, where data is highly correlated, +and perception requires abstraction. To address these challenges, we introduce +Embodied-RAG, a framework that enhances the foundational model of an embodied +agent with a non-parametric memory system capable of autonomously constructing +hierarchical knowledge for both navigation and language generation. +Embodied-RAG handles a full range of spatial and semantic resolutions across +diverse environments and query types, whether for a specific object or a +holistic description of ambiance. At its core, Embodied-RAG's memory is +structured as a semantic forest, storing language descriptions at varying +levels of detail. This hierarchical organization allows the system to +efficiently generate context-sensitive outputs across different robotic +platforms. We demonstrate that Embodied-RAG effectively bridges RAG to the +robotics domain, successfully handling over 250 explanation and navigation +queries across kilometer-level environments, highlighting its promise as a +general-purpose non-parametric system for embodied agents. + +
+
+ comment: Web: https://quanting-xie.github.io/Embodied-RAG-web/ +
+
+
+
+
+ + ♻ ☆ LatentBKI: Open-Dictionary Continuous Mapping in Visual-Language Latent + Spaces with Quantifiable Uncertainty + + +
+ This paper introduces a novel probabilistic mapping algorithm, LatentBKI, +which enables open-vocabulary mapping with quantifiable uncertainty. +Traditionally, semantic mapping algorithms focus on a fixed set of semantic +categories which limits their applicability for complex robotic tasks. +Vision-Language (VL) models have recently emerged as a technique to jointly +model language and visual features in a latent space, enabling semantic +recognition beyond a predefined, fixed set of semantic classes. LatentBKI +recurrently incorporates neural embeddings from VL models into a voxel map with +quantifiable uncertainty, leveraging the spatial correlations of nearby +observations through Bayesian Kernel Inference (BKI). LatentBKI is evaluated +against similar explicit semantic mapping and VL mapping frameworks on the +popular Matterport3D and Semantic KITTI datasets, demonstrating that LatentBKI +maintains the probabilistic benefits of continuous mapping with the additional +benefit of open-dictionary queries. Real-world experiments demonstrate +applicability to challenging indoor environments. + +
+
+
+
+
+ + ♻ ☆ Tightly Coupled SLAM with Imprecise Architectural Plans + + +
+ Robots navigating indoor environments often have access to architectural +plans, which can serve as prior knowledge to enhance their localization and +mapping capabilities. While some SLAM algorithms leverage these plans for +global localization in real-world environments, they typically overlook a +critical challenge: the "as-planned" architectural designs frequently deviate +from the "as-built" real-world environments. To address this gap, we present a +novel algorithm that tightly couples LIDAR-based simultaneous localization and +mapping with architectural plans under the presence of deviations. Our method +utilizes a multi-layered semantic representation to not only localize the +robot, but also to estimate global alignment and structural deviations between +"as-planned" and as-built environments in real-time. To validate our approach, +we performed experiments in simulated and real datasets demonstrating +robustness to structural deviations up to 35 cm and 15 degrees. On average, our +method achieves 43% less localization error than baselines in simulated +environments, while in real environments, the as-built 3D maps show 7% lower +average alignment error + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 131 + +
+
+
+ + ☆ Towards Affordance-Aware Articulation Synthesis for Rigged Objects + + +
+ Rigged objects are commonly used in artist pipelines, as they can flexibly +adapt to different scenes and postures. However, articulating the rigs into +realistic affordance-aware postures (e.g., following the context, respecting +the physics and the personalities of the object) remains time-consuming and +heavily relies on human labor from experienced artists. In this paper, we +tackle the novel problem and design A3Syn. With a given context, such as the +environment mesh and a text prompt of the desired posture, A3Syn synthesizes +articulation parameters for arbitrary and open-domain rigged objects obtained +from the Internet. The task is incredibly challenging due to the lack of +training data, and we do not make any topological assumptions about the +open-domain rigs. We propose using 2D inpainting diffusion model and several +control techniques to synthesize in-context affordance information. Then, we +develop an efficient bone correspondence alignment using a combination of +differentiable rendering and semantic correspondence. A3Syn has stable +convergence, completes in minutes, and synthesizes plausible affordance on +different combinations of in-the-wild object rigs and scenes. + +
+
+ comment: Project page: https://chuyu.org/research/a3syn +
+
+
+
+
+ + ☆ Learning segmentation from point trajectories NeurIPS 2024 + + +
+ We consider the problem of segmenting objects in videos based on their motion +and no other forms of supervision. Prior work has often approached this problem +by using the principle of common fate, namely the fact that the motion of +points that belong to the same object is strongly correlated. However, most +authors have only considered instantaneous motion from optical flow. In this +work, we present a way to train a segmentation network using long-term point +trajectories as a supervisory signal to complement optical flow. The key +difficulty is that long-term motion, unlike instantaneous motion, is difficult +to model -- any parametric approximation is unlikely to capture complex motion +patterns over long periods of time. We instead draw inspiration from subspace +clustering approaches, proposing a loss function that seeks to group the +trajectories into low-rank matrices where the motion of object points can be +approximately explained as a linear combination of other point tracks. Our +method outperforms the prior art on motion-based segmentation, which shows the +utility of long-term motion and the effectiveness of our formulation. + +
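A simplified sketch of the low-rank grouping idea, penalizing the nuclear norm of the (softly) masked trajectory matrix of each group, is shown below. It is one plausible surrogate for the loss described above, not the paper's exact formulation.

```python
import torch

def low_rank_grouping_loss(tracks, soft_masks, eps=1e-6):
    """Encourage point trajectories assigned to the same object to span a
    low-rank subspace via the nuclear norm of each group's weighted
    trajectory matrix (a simplified reading of the idea, not the exact loss).
      tracks:     (N, 2T) long-term point trajectories (x, y over T frames)
      soft_masks: (N, K) per-point soft assignment to K groups
    """
    loss = tracks.new_zeros(())
    for k in range(soft_masks.shape[1]):
        w = soft_masks[:, k:k + 1].clamp_min(eps).sqrt()    # (N, 1)
        weighted = w * tracks                               # down-weight other groups
        loss = loss + torch.linalg.svdvals(weighted).sum()  # nuclear norm
    return loss / soft_masks.shape[1]

tracks = torch.randn(128, 2 * 30, requires_grad=True)
masks = torch.softmax(torch.randn(128, 4), dim=1)
low_rank_grouping_loss(tracks, masks).backward()
```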
+
+ comment: NeurIPS 2024 Spotlight. Project + https://www.robots.ox.ac.uk/~vgg/research/lrtl/ +
+
+
+
+
+ + ☆ GPS as a Control Signal for Image Generation + + +
+ We show that the GPS tags contained in photo metadata provide a useful +control signal for image generation. We train GPS-to-image models and use them +for tasks that require a fine-grained understanding of how images vary within a +city. In particular, we train a diffusion model to generate images conditioned +on both GPS and text. The learned model generates images that capture the +distinctive appearance of different neighborhoods, parks, and landmarks. We +also extract 3D models from 2D GPS-to-image models through score distillation +sampling, using GPS conditioning to constrain the appearance of the +reconstruction from each viewpoint. Our evaluations suggest that our +GPS-conditioned models successfully learn to generate images that vary based on +location, and that GPS conditioning improves estimated 3D structure. + +
+
+
+
+
+ + ☆ Taming Teacher Forcing for Masked Autoregressive Video Generation + + +
+ We introduce MAGI, a hybrid video generation framework that combines masked +modeling for intra-frame generation with causal modeling for next-frame +generation. Our key innovation, Complete Teacher Forcing (CTF), conditions +masked frames on complete observation frames rather than masked ones (namely +Masked Teacher Forcing, MTF), enabling a smooth transition from token-level +(patch-level) to frame-level autoregressive generation. CTF significantly +outperforms MTF, achieving a +23% improvement in FVD scores on first-frame +conditioned video prediction. To address issues like exposure bias, we employ +targeted training strategies, setting a new benchmark in autoregressive video +generation. Experiments show that MAGI can generate long, coherent video +sequences exceeding 100 frames, even when trained on as few as 16 frames, +highlighting its potential for scalable, high-quality video generation. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ Continuous 3D Perception Model with Persistent State + + +
+ We present a unified framework capable of solving a broad range of 3D tasks. +Our approach features a stateful recurrent model that continuously updates its +state representation with each new observation. Given a stream of images, this +evolving state can be used to generate metric-scale pointmaps (per-pixel 3D +points) for each new input in an online fashion. These pointmaps reside within +a common coordinate system, and can be accumulated into a coherent, dense scene +reconstruction that updates as new images arrive. Our model, called CUT3R +(Continuous Updating Transformer for 3D Reconstruction), captures rich priors +of real-world scenes: not only can it predict accurate pointmaps from image +observations, but it can also infer unseen regions of the scene by probing at +virtual, unobserved views. Our method is simple yet highly flexible, naturally +accepting varying lengths of images that may be either video streams or +unordered photo collections, containing both static and dynamic content. We +evaluate our method on various 3D/4D tasks and demonstrate competitive or +state-of-the-art performance in each. Project Page: https://cut3r.github.io/ + +
+
+
+
+
+ + ☆ InternVideo2.5: Empowering Video MLLMs with Long and Rich Context + Modeling + + +
+ This paper aims to improve the performance of video multimodal large language
+models (MLLM) via long and rich context (LRC) modeling. As a result, we develop
+a new version of InternVideo2.5 with a focus on enhancing the original MLLMs'
+ability to perceive fine-grained details and capture long-form temporal
+structure in videos. Specifically, our approach incorporates dense vision task
+annotations into MLLMs using direct preference optimization and develops
+compact spatiotemporal representations through adaptive hierarchical token
+compression. Experimental results demonstrate that this unique design of LRC
+greatly improves the results of video MLLM in mainstream video understanding
+benchmarks (short & long), enabling the MLLM to memorize significantly longer
+video inputs (at least 6x longer than the original), and master specialized
+vision capabilities like object tracking and segmentation. Our work highlights
+the importance of multimodal context richness (length and fineness) in
+empowering MLLM's innate abilities (focus and memory), providing new insights
+for future research on video MLLM. Code and models are available at
+https://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2.5
+
+
+
+ comment: technical report +
+
+
+
+
+ + ☆ CCESAR: Coastline Classification-Extraction From SAR Images Using + CNN-U-Net Combination + + +
+ In this article, we improve the deep learning solution for coastline +extraction from Synthetic Aperture Radar (SAR) images by proposing a two-stage +model involving image classification followed by segmentation. We hypothesize +that a single segmentation model usually used for coastline detection is +insufficient to characterize different coastline types. We demonstrate that the +need for a two-stage workflow prevails through different compression levels of +these images. Our results from experiments using a combination of CNN and U-Net +models on Sentinel-1 images show that the two-stage workflow, coastline +classification-extraction from SAR images (CCESAR) outperforms a single U-Net +segmentation model. + +
+
+
+
+
+ + ☆ DiffDoctor: Diagnosing Image Diffusion Models Before Treating + + +
+ In spite of the recent progress, image diffusion models still produce +artifacts. A common solution is to refine an established model with a quality +assessment system, which generally rates an image in its entirety. In this +work, we believe problem-solving starts with identification, yielding the +request that the model should be aware of not just the presence of defects in +an image, but their specific locations. Motivated by this, we propose +DiffDoctor, a two-stage pipeline to assist image diffusion models in generating +fewer artifacts. Concretely, the first stage targets developing a robust +artifact detector, for which we collect a dataset of over 1M flawed synthesized +images and set up an efficient human-in-the-loop annotation process, +incorporating a carefully designed class-balance strategy. The learned artifact +detector is then involved in the second stage to tune the diffusion model +through assigning a per-pixel confidence map for each synthesis. Extensive +experiments on text-to-image diffusion models demonstrate the effectiveness of +our artifact detector as well as the soundness of our diagnose-then-treat +design. + +
+
+ comment: 8 pages of main body and 2 pages of references, 9 figures, 2 tables +
+
+
+
+
+ + ☆ Parallel Sequence Modeling via Generalized Spatial Propagation Network + + +
+ We present the Generalized Spatial Propagation Network (GSPN), a new +attention mechanism optimized for vision tasks that inherently captures 2D +spatial structures. Existing attention models, including transformers, linear +attention, and state-space models like Mamba, process multi-dimensional data as +1D sequences, compromising spatial coherence and efficiency. GSPN overcomes +these limitations by directly operating on spatially coherent image data and +forming dense pairwise connections through a line-scan approach. Central to +GSPN is the Stability-Context Condition, which ensures stable, context-aware +propagation across 2D sequences and reduces the effective sequence length to +$\sqrt{N}$ for a square map with N elements, significantly enhancing +computational efficiency. With learnable, input-dependent weights and no +reliance on positional embeddings, GSPN achieves superior spatial fidelity and +state-of-the-art performance in vision tasks, including ImageNet +classification, class-guided image generation, and text-to-image generation. +Notably, GSPN accelerates SD-XL with softmax-attention by over $84\times$ when +generating 16K images. + +
+
+ comment: Project page: http://whj363636.github.io/GSPN/ +
+
+
+
+
+ + ☆ MMVU: Measuring Expert-Level Multi-Discipline Video Understanding + + +
+ We introduce MMVU, a comprehensive expert-level, multi-discipline benchmark
+for evaluating foundation models in video understanding. MMVU includes 3,000
+expert-annotated questions spanning 27 subjects across four core disciplines:
+Science, Healthcare, Humanities & Social Sciences, and Engineering. Compared to
+prior benchmarks, MMVU features three key advancements. First, it challenges
+models to apply domain-specific knowledge and perform expert-level reasoning to
+analyze specialized-domain videos, moving beyond the basic visual perception
+typically assessed in current video benchmarks. Second, each example is
+annotated by human experts from scratch. We implement strict data quality
+controls to ensure the high quality of the dataset. Finally, each example is
+enriched with expert-annotated reasoning rationales and relevant domain
+knowledge, facilitating in-depth analysis. We conduct an extensive evaluation
+of 32 frontier multimodal foundation models on MMVU. The latest
+System-2-capable models, o1 and Gemini 2.0 Flash Thinking, achieve the highest
+performance among the tested models. However, they still fall short of matching
+human expertise. Through in-depth error analyses and case studies, we offer
+actionable insights for future advancements in expert-level,
+knowledge-intensive video understanding for specialized domains.
+
+
+
+
+
+
+ + ☆ Video Depth Anything: Consistent Depth Estimation for Super-Long Videos + + +
+ Depth Anything has achieved remarkable success in monocular depth estimation +with strong generalization ability. However, it suffers from temporal +inconsistency in videos, hindering its practical applications. Various methods +have been proposed to alleviate this issue by leveraging video generation +models or introducing priors from optical flow and camera poses. Nonetheless, +these methods are only applicable to short videos (< 10 seconds) and require a +trade-off between quality and computational efficiency. We propose Video Depth +Anything for high-quality, consistent depth estimation in super-long videos +(over several minutes) without sacrificing efficiency. We base our model on +Depth Anything V2 and replace its head with an efficient spatial-temporal head. +We design a straightforward yet effective temporal consistency loss by +constraining the temporal depth gradient, eliminating the need for additional +geometric priors. The model is trained on a joint dataset of video depth and +unlabeled images, similar to Depth Anything V2. Moreover, a novel +key-frame-based strategy is developed for long video inference. Experiments +show that our model can be applied to arbitrarily long videos without +compromising quality, consistency, or generalization ability. Comprehensive +evaluations on multiple video benchmarks demonstrate that our approach sets a +new state-of-the-art in zero-shot video depth estimation. We offer models of +different scales to support a range of scenarios, with our smallest model +capable of real-time performance at 30 FPS. + +
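One plausible reading of a "temporal consistency loss that constrains the temporal depth gradient" is sketched below; the actual formulation, including how pixels are associated across frames, follows the paper rather than this code.

```python
import torch

def temporal_gradient_loss(depth_prev, depth_curr, valid_mask=None):
    """Penalize the temporal depth gradient between consecutive predictions.
    This is only a plausible sketch; pixel association across frames and the
    exact penalty are assumptions, not the paper's definition.
      depth_prev, depth_curr: (B, 1, H, W) predicted depth maps
      valid_mask: optional (B, 1, H, W) mask of pixels to supervise
    """
    dt = depth_curr - depth_prev                # temporal depth gradient
    loss = dt.abs()
    if valid_mask is not None:
        loss = loss * valid_mask
        return loss.sum() / valid_mask.sum().clamp_min(1.0)
    return loss.mean()

d0 = torch.rand(2, 1, 64, 64)
d1 = torch.rand(2, 1, 64, 64, requires_grad=True)
temporal_gradient_loss(d0, d1).backward()
```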
+
+
+
+
+ + ☆ DARB-Splatting: Generalizing Splatting with Decaying Anisotropic Radial + Basis Functions + + +
+ Splatting-based 3D reconstruction methods have gained popularity with the +advent of 3D Gaussian Splatting, efficiently synthesizing high-quality novel +views. These methods commonly resort to using exponential family functions, +such as the Gaussian function, as reconstruction kernels due to their +anisotropic nature, ease of projection, and differentiability in rasterization. +However, the field remains restricted to variations within the exponential +family, leaving generalized reconstruction kernels largely underexplored, +partly due to the lack of easy integrability in 3D to 2D projections. In this +light, we show that a class of decaying anisotropic radial basis functions +(DARBFs), which are non-negative functions of the Mahalanobis distance, +supports splatting by approximating the Gaussian function's closed-form +integration advantage. With this fresh perspective, we demonstrate up to 34% +faster convergence during training and a 15% reduction in memory consumption +across various DARB reconstruction kernels, while maintaining comparable PSNR, +SSIM, and LPIPS results. We will make the code available. + +
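The following sketch contrasts the Gaussian kernel with one example of a decaying radial basis function of the Mahalanobis distance (a raised-cosine window). The specific DARBF family members used in the paper may differ.

```python
import numpy as np

def mahalanobis_sq(x, mu, cov):
    d = x - mu
    return d @ np.linalg.inv(cov) @ d

def gaussian_kernel(d2):
    return np.exp(-0.5 * d2)

def cosine_decay_kernel(d2, cutoff=3.0):
    """One example of a non-negative, decaying radial basis function of the
    Mahalanobis distance: a raised-cosine window that reaches zero at `cutoff`.
    Chosen for illustration only."""
    d = np.sqrt(d2)
    return np.where(d < cutoff, 0.5 * (1 + np.cos(np.pi * d / cutoff)), 0.0)

mu = np.zeros(3)
cov = np.diag([0.4, 0.2, 0.1])              # anisotropic covariance
x = np.array([0.3, -0.1, 0.05])
d2 = mahalanobis_sq(x, mu, cov)
print(gaussian_kernel(d2), cosine_decay_kernel(d2))
```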
+
+ comment: Link to the project page: + https://randomnerds.github.io/darbs.github.io/ +
+
+
+
+
+ + ☆ InternLM-XComposer2.5-Reward: A Simple Yet Effective Multi-Modal Reward + Model + + +
+ Despite the promising performance of Large Vision Language Models (LVLMs) in
+visual understanding, they occasionally generate incorrect outputs. While
+reward models (RMs) with reinforcement learning or test-time scaling offer the
+potential for improving generation quality, a critical gap remains: publicly
+available multi-modal RMs for LVLMs are scarce, and the implementation details
+of proprietary models are often unclear. We bridge this gap with
+InternLM-XComposer2.5-Reward (IXC-2.5-Reward), a simple yet effective
+multi-modal reward model that aligns LVLMs with human preferences. To ensure
+the robustness and versatility of IXC-2.5-Reward, we set up a high-quality
+multi-modal preference corpus spanning text, image, and video inputs across
+diverse domains, such as instruction following, general understanding,
+text-rich documents, mathematical reasoning, and video understanding.
+IXC-2.5-Reward achieves excellent results on the latest multi-modal reward
+model benchmark and shows competitive performance on text-only reward model
+benchmarks. We further demonstrate three key applications of IXC-2.5-Reward:
+(1) Providing a supervisory signal for RL training. Integrating IXC-2.5-Reward
+with Proximal Policy Optimization (PPO) yields IXC-2.5-Chat, which shows
+consistent improvements in instruction following and multi-modal open-ended
+dialogue; (2) Selecting the best response from candidate responses for
+test-time scaling; and (3) Filtering outlier or noisy samples from existing
+image and video instruction tuning training data. To ensure reproducibility and
+facilitate further research, we have open-sourced all model weights and
+training recipes at https://github.com/InternLM/InternLM-XComposer
+
+
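The second application, selecting the best response from candidates at test time, reduces to best-of-N scoring with the reward model. The sketch below uses a stand-in scoring function; IXC-2.5-Reward's real interface and multimodal inputs will differ.

```python
from typing import Callable, List, Tuple

def best_of_n(prompt: str,
              candidates: List[str],
              reward_fn: Callable[[str, str], float]) -> Tuple[str, float]:
    """Test-time scaling via best-of-N selection: score every candidate with a
    reward model and keep the highest-scoring one. `reward_fn` is a placeholder
    for a call to a reward model such as IXC-2.5-Reward."""
    scored = [(reward_fn(prompt, c), c) for c in candidates]
    best_score, best_response = max(scored, key=lambda s: s[0])
    return best_response, best_score

# Toy reward for illustration only: prefer longer, explanatory answers.
toy_reward = lambda q, a: len(a) / 100.0 + (1.0 if "because" in a.lower() else 0.0)
resp, score = best_of_n("Why is the sky blue?",
                        ["It just is.",
                         "Because shorter wavelengths scatter more in air."],
                        toy_reward)
```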
+
+ comment: Tech Report +
+
+
+
+
+ + ☆ Vision-Language Models for Automated Chest X-ray Interpretation: + Leveraging ViT and GPT-2 + + +
+ Radiology plays a pivotal role in modern medicine due to its non-invasive
+diagnostic capabilities. However, the manual generation of unstructured medical
+reports is time-consuming and prone to errors, creating a significant
+bottleneck in clinical workflows. Despite advancements in AI-generated
+radiology reports, challenges remain in achieving detailed and accurate report
+generation. In this study, we evaluated different combinations of multimodal
+models that integrate Computer Vision and Natural Language Processing to
+generate comprehensive radiology reports. We employed a pretrained Vision
+Transformer (ViT-B16) and a SWIN Transformer as the image encoders. The BART
+and GPT-2 models serve as the textual decoders. We used Chest X-ray images and
+reports from the IU-Xray dataset to evaluate the usability of the SWIN
+Transformer-BART, SWIN Transformer-GPT-2, ViT-B16-BART and ViT-B16-GPT-2 models
+for report generation. We aimed to find the best combination among the models.
+The SWIN-BART model was the best-performing of the four models, achieving
+remarkable results on almost all evaluation metrics, including ROUGE, BLEU and
+BERTScore.
+
+
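Pairing a ViT encoder with a GPT-2 decoder can be sketched with Hugging Face's VisionEncoderDecoderModel, as shown below. The public checkpoint names are assumptions, and the untrained pairing still needs fine-tuning on image-report pairs (e.g., IU-Xray) before it produces meaningful reports.

```python
from PIL import Image
from transformers import (AutoImageProcessor, AutoTokenizer,
                          VisionEncoderDecoderModel)

# Pair a ViT encoder with a GPT-2 decoder, analogous to the ViT-B16-GPT-2
# combination evaluated above. Checkpoint names are assumed public checkpoints,
# not necessarily the exact ones used in the study.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "gpt2")
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token           # GPT-2 has no pad token
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

image = Image.new("RGB", (224, 224))                # stand-in for a chest X-ray
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated = model.generate(pixel_values, max_length=40)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```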
+
+ comment: Preprint, manuscript under-review +
+
+
+
+
+ + ☆ Cinepro: Robust Training of Foundation Models for Cancer Detection in + Prostate Ultrasound Cineloops + + +
+ Prostate cancer (PCa) detection using deep learning (DL) models has shown +potential for enhancing real-time guidance during biopsies. However, prostate +ultrasound images lack pixel-level cancer annotations, introducing label noise. +Current approaches often focus on limited regions of interest (ROIs), +disregarding anatomical context necessary for accurate diagnosis. Foundation +models can overcome this limitation by analyzing entire images to capture +global spatial relationships; however, they still encounter challenges stemming +from the weak labels associated with coarse pathology annotations in ultrasound +data. We introduce Cinepro, a novel framework that strengthens foundation +models' ability to localize PCa in ultrasound cineloops. Cinepro adapts robust +training by integrating the proportion of cancer tissue reported by pathology +in a biopsy core into its loss function to address label noise, providing a +more nuanced supervision. Additionally, it leverages temporal data across +multiple frames to apply robust augmentations, enhancing the model's ability to +learn stable cancer-related features. Cinepro demonstrates superior performance +on a multi-center prostate ultrasound dataset, achieving an AUROC of 77.1% and +a balanced accuracy of 83.8%, surpassing current benchmarks. These findings +underscore Cinepro's promise in advancing foundation models for weakly labeled +ultrasound data. + +
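A simplified sketch of supervising with the pathology-reported cancer involvement of each core is shown below: the mean predicted cancer probability over a core is pushed toward that proportion. This illustrates the idea only and is not Cinepro's exact objective.

```python
import torch

def involvement_loss(frame_logits, core_involvement):
    """Weak supervision from pathology-reported involvement: align the mean
    predicted cancer probability per core with the reported proportion.
    A simplified sketch, not the loss function used by Cinepro.
      frame_logits:     (B, N) logits for N frames (or regions) per core
      core_involvement: (B,) fraction of cancer reported for each core
    """
    probs = torch.sigmoid(frame_logits)
    predicted_involvement = probs.mean(dim=1)
    return torch.nn.functional.l1_loss(predicted_involvement, core_involvement)

logits = torch.randn(4, 200, requires_grad=True)   # 4 cores, 200 frames each
involvement = torch.tensor([0.0, 0.1, 0.4, 0.7])
involvement_loss(logits, involvement).backward()
```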
+
+ comment: accepted to IEEE ISBI 2025 +
+
+
+
+
+ + ☆ VARGPT: Unified Understanding and Generation in a Visual Autoregressive + Multimodal Large Language Model + + +
+ We present VARGPT, a novel multimodal large language model (MLLM) that
+unifies visual understanding and generation within a single autoregressive
+framework. VARGPT employs a next-token prediction paradigm for visual
+understanding and a next-scale prediction paradigm for visual autoregressive
+generation. VARGPT innovatively extends the LLaVA architecture, achieving
+efficient scale-wise autoregressive visual generation within MLLMs while
+seamlessly accommodating mixed-modal input and output within a single model
+framework. Our VARGPT undergoes a three-stage unified training process on
+specially curated datasets, comprising a pre-training phase and two mixed
+visual instruction-tuning phases. The unified training strategy is designed to
+achieve alignment between visual and textual features, enhance instruction
+following for both understanding and generation, and improve visual generation
+quality, respectively. Despite its LLaVA-based architecture for multimodal
+understanding, VARGPT significantly outperforms LLaVA-1.5 across various
+vision-centric benchmarks, such as visual question-answering and reasoning
+tasks. Notably, VARGPT naturally supports capabilities in autoregressive visual
+generation and instruction-to-image synthesis, showcasing its versatility in
+both visual understanding and generation tasks. Project page is at:
+https://vargpt-1.github.io/
+
+
+
+
+
+
+ + ☆ UI-TARS: Pioneering Automated GUI Interaction with Native Agents + + +
+ This paper introduces UI-TARS, a native GUI agent model that solely perceives +the screenshots as input and performs human-like interactions (e.g., keyboard +and mouse operations). Unlike prevailing agent frameworks that depend on +heavily wrapped commercial models (e.g., GPT-4o) with expert-crafted prompts +and workflows, UI-TARS is an end-to-end model that outperforms these +sophisticated frameworks. Experiments demonstrate its superior performance: +UI-TARS achieves SOTA performance in 10+ GUI agent benchmarks evaluating +perception, grounding, and GUI task execution. Notably, in the OSWorld +benchmark, UI-TARS achieves scores of 24.6 with 50 steps and 22.7 with 15 +steps, outperforming Claude (22.0 and 14.9 respectively). In AndroidWorld, +UI-TARS achieves 46.6, surpassing GPT-4o (34.5). UI-TARS incorporates several +key innovations: (1) Enhanced Perception: leveraging a large-scale dataset of +GUI screenshots for context-aware understanding of UI elements and precise +captioning; (2) Unified Action Modeling, which standardizes actions into a +unified space across platforms and achieves precise grounding and interaction +through large-scale action traces; (3) System-2 Reasoning, which incorporates +deliberate reasoning into multi-step decision making, involving multiple +reasoning patterns such as task decomposition, reflection thinking, milestone +recognition, etc. (4) Iterative Training with Reflective Online Traces, which +addresses the data bottleneck by automatically collecting, filtering, and +reflectively refining new interaction traces on hundreds of virtual machines. +Through iterative training and reflection tuning, UI-TARS continuously learns +from its mistakes and adapts to unforeseen situations with minimal human +intervention. We also analyze the evolution path of GUI agents to guide the +further development of this domain. + +
+
+
+
+
+ + ☆ Deep Learning Based Segmentation of Blood Vessels from H&E Stained + Oesophageal Adenocarcinoma Whole-Slide Images + + +
+ Blood vessels (BVs) play a critical role in the Tumor Micro-Environment +(TME), potentially influencing cancer progression and treatment response. +However, manually quantifying BVs in Hematoxylin and Eosin (H&E) stained images +is challenging and labor-intensive due to their heterogeneous appearances. We +propose a novel approach that constructs guiding maps to improve the +performance of state-of-the-art segmentation models for BV segmentation; the +guiding maps encourage the models to learn representative features of BVs. This +is particularly beneficial for computational pathology, where labeled training +data is often limited and large models are prone to overfitting. We present +quantitative and qualitative results that demonstrate the efficacy of our +approach in improving segmentation accuracy. In future work, we plan to validate +this method for segmenting BVs across various tissue types and investigate the +role of cellular structures in relation to BVs in the TME.
+
+ comment: Accepted by ISBI 2025 +
+
+
+
+
+ + ☆ Metric for Evaluating Performance of Reference-Free Demorphing Methods + + +
+ A facial morph is an image created by combining two (or more) face images +pertaining to two (or more) distinct identities. Reference-free face demorphing +inverts the process and tries to recover the face images constituting a facial +morph without using any other information. However, there is no consensus on +the evaluation metrics to be used to evaluate and compare such demorphing +techniques. In this paper, we first analyze the shortcomings of the demorphing +metrics currently used in the literature. We then propose a new metric called +biometrically cross-weighted IQA that overcomes these issues and extensively +benchmark current methods on the proposed metric to show its efficacy. +Experiments on three existing demorphing methods and six datasets on two +commonly used face matchers validate the efficacy of our proposed metric. + +
+
+
+
+
+ + ☆ BlanketGen2-Fit3D: Synthetic Blanket Augmentation Towards Improving + Real-World In-Bed Blanket Occluded Human Pose Estimation + + +
+ Human Pose Estimation (HPE) from monocular RGB images is crucial for clinical +in-bed skeleton-based action recognition; however, it poses unique challenges +for HPE models due to the frequent presence of blankets occluding the person, +while labeled HPE data in this scenario is scarce. To address this, we introduce +BlanketGen2-Fit3D (BG2-Fit3D), an augmentation of the Fit3D dataset that contains +1,217,312 frames with synthetic photo-realistic blankets. To generate it, we +used BlanketGen2, an improved version of our BlanketGen pipeline that +simulates synthetic blankets using ground-truth Skinned Multi-Person Linear +model (SMPL) meshes and then renders them as transparent images that can be +layered on top of the original frames. This dataset was used in combination +with the original Fit3D to fine-tune the ViTPose-B HPE model and evaluate the +effectiveness of synthetic blanket augmentation. The trained models were further +evaluated on a real-world blanket-occluded in-bed HPE dataset (the SLP dataset). +Compared with architectures trained only on Fit3D, those trained with our +synthetic blanket augmentation significantly improved pose estimation performance +on BG2-Fit3D, the synthetic blanket-occluded dataset, reaching 0.977 +Percentage of Correct Keypoints (PCK) and 0.149 Normalized Mean Error (NME), with +an absolute 4.4% PCK increase. Furthermore, the test results on SLP +demonstrated the utility of synthetic data augmentation by improving +performance by an absolute 2.3% PCK on real-world images with poses +occluded by real blankets. These results show synthetic blanket augmentation +has the potential to improve in-bed blanket-occluded HPE from RGB images. The +dataset as well as the code will be made available to the public.
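The layering step described above (rendering blankets as transparent images and compositing them onto original frames) amounts to standard alpha compositing. A minimal sketch with Pillow, assuming an RGBA blanket render and an RGB frame; the file names are hypothetical.

```python
from PIL import Image

def composite_blanket(frame_path, blanket_rgba_path, out_path):
    # Layer a rendered blanket image (with alpha channel) on top of a video frame.
    frame = Image.open(frame_path).convert("RGBA")
    blanket = Image.open(blanket_rgba_path).convert("RGBA").resize(frame.size)
    Image.alpha_composite(frame, blanket).convert("RGB").save(out_path)

composite_blanket("frame_000123.png", "blanket_000123.png", "augmented_000123.png")
```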
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Sublinear Variational Optimization of Gaussian Mixture Models with + Millions to Billions of Parameters + + +
+ Gaussian Mixture Models (GMMs) range among the most frequently used machine +learning models. However, training large, general GMMs becomes computationally +prohibitive for datasets with many data points $N$ of high-dimensionality $D$. +For GMMs with arbitrary covariances, we here derive a highly efficient +variational approximation, which is integrated with mixtures of factor +analyzers (MFAs). For GMMs with $C$ components, our proposed algorithm +significantly reduces runtime complexity per iteration from +$\mathcal{O}(NCD^2)$ to a complexity scaling linearly with $D$ and remaining +constant w.r.t. $C$. Numerical validation of this theoretical complexity +reduction then shows the following: the distance evaluations required for the +entire GMM optimization process scale sublinearly with $NC$. On large-scale +benchmarks, this sublinearity results in speed-ups of an order-of-magnitude +compared to the state-of-the-art. As a proof of concept, we train GMMs with +over 10 billion parameters on about 100 million images, and observe training +times of approximately nine hours on a single state-of-the-art CPU. + +
+
+ comment: 22 pages, 6 figures (and 17 pages, 3 figures in Appendix) +
+
+
+
+
+ + ☆ RALAD: Bridging the Real-to-Sim Domain Gap in Autonomous Driving with + Retrieval-Augmented Learning + + +
+ In the pursuit of robust autonomous driving systems, models trained on +real-world datasets often struggle to adapt to new environments, particularly +when confronted with corner cases such as extreme weather conditions. +Collecting these corner cases in the real world is non-trivial, which +necessitates the use of simulators for validation. However, the high +computational cost and the domain gap in data distribution have hindered the +seamless transition between real and simulated driving scenarios. To tackle +this challenge, we propose Retrieval-Augmented Learning for Autonomous Driving +(RALAD), a novel framework designed to bridge the real-to-sim gap at a low +cost. RALAD features three primary designs, including (1) domain adaptation via +an enhanced Optimal Transport (OT) method that accounts for both individual and +grouped image distances, (2) a simple and unified framework that can be applied +to various models, and (3) efficient fine-tuning techniques that freeze the +computationally expensive layers while maintaining robustness. Experimental +results demonstrate that RALAD compensates for the performance degradation in +simulated environments while maintaining accuracy in real-world scenarios +across three different models. Taking Cross View as an example, the mIOU and +mAP metrics in real-world scenarios remain stable before and after RALAD +fine-tuning, while in simulated environments, the mIOU and mAP metrics are +improved by 10.30% and 12.29%, respectively. Moreover, the re-training cost of +our approach is reduced by approximately 88.1%. Our code is available at +https://github.com/JiachengZuo/RALAD.git.
+
+
+
+
+ + ☆ Towards Accurate Unified Anomaly Segmentation + + +
+ Unsupervised anomaly detection (UAD) from images strives to model normal data +distributions, creating discriminative representations to distinguish and +precisely localize anomalies. Despite recent advancements in the efficient and +unified one-for-all scheme, challenges persist in accurately segmenting +anomalies for further monitoring. Moreover, this problem is obscured by the +widely-used AUROC metric under imbalanced UAD settings. This motivates us to +emphasize the significance of precise segmentation of anomaly pixels using pAP +and DSC as metrics. To address the unsolved segmentation task, we introduce the +Unified Anomaly Segmentation (UniAS). UniAS presents a multi-level hybrid +pipeline that progressively enhances normal information from coarse to fine, +incorporating a novel multi-granularity gated CNN (MGG-CNN) into Transformer +layers to explicitly aggregate local details from different granularities. +UniAS achieves state-of-the-art anomaly segmentation performance, attaining +65.12/59.33 and 40.06/32.50 in pAP/DSC on the MVTec-AD and VisA datasets, +respectively, surpassing previous methods significantly. The codes are shared +at https://github.com/Mwxinnn/UniAS. + +
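Since the abstract argues for pAP and DSC over AUROC, here is a minimal sketch of how those two pixel-level metrics can be computed, assuming flattened anomaly score maps and binary ground-truth masks; the fixed threshold used for DSC is an assumption.

```python
import numpy as np
from sklearn.metrics import average_precision_score

def pixel_ap_and_dice(score_map, gt_mask, threshold=0.5):
    """Pixel-level average precision (pAP) and Dice similarity coefficient (DSC)."""
    scores = score_map.ravel()
    truth = gt_mask.ravel().astype(bool)
    p_ap = average_precision_score(truth, scores)
    pred = scores >= threshold
    dsc = 2.0 * np.logical_and(pred, truth).sum() / max(pred.sum() + truth.sum(), 1)
    return p_ap, dsc
```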
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Regressor-Guided Image Editing Regulates Emotional Response to Reduce + Online Engagement + + +
+ Emotions are known to mediate the relationship between users' content +consumption and their online engagement, with heightened emotional intensity +leading to increased engagement. Building on this insight, we propose three +regressor-guided image editing approaches aimed at diminishing the emotional +impact of images. These include (i) a parameter optimization approach based on +global image transformations known to influence emotions, (ii) an optimization +approach targeting the style latent space of a generative adversarial network, +and (iii) a diffusion-based approach employing classifier guidance and +classifier-free guidance. Our findings demonstrate that these approaches can +effectively alter the emotional properties of images while maintaining high +visual quality. Optimization-based methods primarily adjust low-level +properties like color hues and brightness, whereas the diffusion-based approach +introduces semantic changes, such as altering appearance or facial expressions. +Notably, results from a behavioral study reveal that only the diffusion-based +approach successfully elicits changes in viewers' emotional responses while +preserving high perceived image quality. In future work, we will investigate +the impact of these image adaptations on internet user behavior.
+
+ comment: 39 pages, 22 figures +
+
+
+
+
+ + ☆ With Great Backbones Comes Great Adversarial Transferability + + +
+ Advances in self-supervised learning (SSL) for machine vision have improved +representation robustness and model performance, giving rise to pre-trained +backbones like \emph{ResNet} and \emph{ViT} models tuned with SSL methods such +as \emph{SimCLR}. Due to the computational and data demands of pre-training, +the utilization of such backbones becomes a strenuous necessity. However, +employing these backbones may inherit vulnerabilities to adversarial attacks. +While adversarial robustness has been studied under \emph{white-box} and +\emph{black-box} settings, the robustness of models tuned on pre-trained +backbones remains largely unexplored. Additionally, the role of tuning +meta-information in mitigating exploitation risks is unclear. This work +systematically evaluates the adversarial robustness of such models across +$20,000$ combinations of tuning meta-information, including fine-tuning +techniques, backbone families, datasets, and attack types. We propose using +proxy models to transfer attacks, simulating varying levels of target knowledge +by fine-tuning these proxies with diverse configurations. Our findings reveal +that proxy-based attacks approach the effectiveness of \emph{white-box} +methods, even with minimal tuning knowledge. We also introduce a naive +"backbone attack," leveraging only the backbone to generate adversarial +samples, which outperforms \emph{black-box} attacks and rivals \emph{white-box} +methods, highlighting critical risks in model-sharing practices. Finally, our +ablations reveal how increasing tuning meta-information impacts attack +transferability, measuring each meta-information combination. + +
+
+
+
+
+ + ☆ Benchmarking Image Perturbations for Testing Automated Driving + Assistance Systems + + +
+ Advanced Driver Assistance Systems (ADAS) based on deep neural networks +(DNNs) are widely used in autonomous vehicles for critical perception tasks +such as object detection, semantic segmentation, and lane recognition. However, +these systems are highly sensitive to input variations, such as noise and +changes in lighting, which can compromise their effectiveness and potentially +lead to safety-critical failures. + This study offers a comprehensive empirical evaluation of image +perturbations, techniques commonly used to assess the robustness of DNNs, to +validate and improve the robustness and generalization of ADAS perception +systems. We first conducted a systematic review of the literature, identifying +38 categories of perturbations. Next, we evaluated their effectiveness in +revealing failures in two different ADAS, both at the component and at the +system level. Finally, we explored the use of perturbation-based data +augmentation and continuous learning strategies to improve ADAS adaptation to +new operational design domains. Our results demonstrate that all categories of +image perturbations successfully expose robustness issues in ADAS and that the +use of dataset augmentation and continuous learning significantly improves ADAS +performance in novel, unseen environments. + +
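As a flavor of what such perturbations look like in practice, a minimal sketch of two simple categories (additive Gaussian noise and a brightness shift) applied to a normalized image; the study covers 38 categories, so this is only illustrative and the parameterization is an assumption.

```python
import numpy as np

def perturb(image, kind, severity=0.2, seed=0):
    """Apply a simple perturbation to a float image with values in [0, 1]."""
    rng = np.random.default_rng(seed)
    if kind == "gaussian_noise":
        out = image + rng.normal(0.0, severity, image.shape)
    elif kind == "brightness":
        out = image + severity
    else:
        raise ValueError(f"unknown perturbation kind: {kind}")
    return np.clip(out, 0.0, 1.0).astype(np.float32)
```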
+
+ comment: Accepted for publication at the 18th IEEE International Conference on + Software Testing, Verification and Validation (ICST 2025) +
+
+
+
+
+ + ☆ VipDiff: Towards Coherent and Diverse Video Inpainting via Training-free + Denoising Diffusion Models + + +
+ Recent video inpainting methods have achieved encouraging improvements by +leveraging optical flow to guide pixel propagation from reference frames either +in the image space or feature space. However, they would produce severe +artifacts in the mask center when the masked area is too large and no pixel +correspondences can be found for the center. Recently, diffusion models have +demonstrated impressive performance in generating diverse and high-quality +images, and have been exploited in a number of works for image inpainting. +These methods, however, cannot be applied directly to videos to produce +temporal-coherent inpainting results. In this paper, we propose a training-free +framework, named VipDiff, for conditioning diffusion model on the reverse +diffusion process to produce temporal-coherent inpainting results without +requiring any training data or fine-tuning the pre-trained diffusion models. +VipDiff takes optical flow as guidance to extract valid pixels from reference +frames to serve as constraints in optimizing the randomly sampled Gaussian +noise, and uses the generated results for further pixel propagation and +conditional generation. VipDiff also allows for generating diverse video +inpainting results over different sampled noise. Experiments demonstrate that +VipDiff can largely outperform state-of-the-art video inpainting methods in +terms of both spatial-temporal coherence and fidelity. + +
+
+ comment: 10 pages, 5 Figures (Accepted at WACV 2025) +
+
+
+
+
+ + ☆ CBVLM: Training-free Explainable Concept-based Large Vision Language + Models for Medical Image Classification + + +
+ The main challenges limiting the adoption of deep learning-based solutions in +medical workflows are the availability of annotated data and the lack of +interpretability of such systems. Concept Bottleneck Models (CBMs) tackle the +latter by constraining the final disease prediction on a set of predefined and +human-interpretable concepts. However, the increased interpretability achieved +through these concept-based explanations implies a higher annotation burden. +Moreover, if a new concept needs to be added, the whole system needs to be +retrained. Inspired by the remarkable performance shown by Large +Vision-Language Models (LVLMs) in few-shot settings, we propose a simple, yet +effective, methodology, CBVLM, which tackles both of the aforementioned +challenges. First, for each concept, we prompt the LVLM to answer if the +concept is present in the input image. Then, we ask the LVLM to classify the +image based on the previous concept predictions. Moreover, in both stages, we +incorporate a retrieval module responsible for selecting the best examples for +in-context learning. By grounding the final diagnosis on the predicted +concepts, we ensure explainability, and by leveraging the few-shot capabilities +of LVLMs, we drastically lower the annotation cost. We validate our approach +with extensive experiments across four medical datasets and twelve LVLMs (both +generic and medical) and show that CBVLM consistently outperforms CBMs and +task-specific supervised methods without requiring any training and using just +a few annotated examples. More information on our project page: +https://cristianopatricio.github.io/CBVLM/. + +
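The two-stage prompting strategy can be sketched as below; `query_lvlm(image, prompt)` is a hypothetical stand-in for whichever LVLM is used, and the exact prompt wording and the in-context retrieval module are omitted.

```python
def cbvlm_predict(image, concepts, classes, query_lvlm):
    """Two-stage, training-free prediction: concepts first, then the diagnosis."""
    # Stage 1: ask the LVLM about each human-interpretable concept.
    concept_answers = {
        c: query_lvlm(image, f"Is the concept '{c}' present in this image? Answer yes or no.")
        for c in concepts
    }
    # Stage 2: ground the final classification on the predicted concepts.
    summary = ", ".join(f"{c}: {a}" for c, a in concept_answers.items())
    label = query_lvlm(
        image,
        f"Given these concept findings ({summary}), classify the image as one of {classes}.",
    )
    return concept_answers, label
```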
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ mmCooper: A Multi-agent Multi-stage Communication-efficient and + Collaboration-robust Cooperative Perception Framework + + +
+ Collaborative perception significantly enhances individual vehicle perception +performance through the exchange of sensory information among agents. However, +real-world deployment faces challenges due to bandwidth constraints and +inevitable calibration errors during information exchange. To address these +issues, we propose mmCooper, a novel multi-agent, multi-stage, +communication-efficient, and collaboration-robust cooperative perception +framework. Our framework leverages a multi-stage collaboration strategy that +dynamically and adaptively balances intermediate- and late-stage information to +share among agents, enhancing perceptual performance while maintaining +communication efficiency. To support robust collaboration despite potential +misalignments and calibration errors, our framework captures multi-scale +contextual information for robust fusion in the intermediate stage and +calibrates the received detection results to improve accuracy in the late +stage. We validate the effectiveness of mmCooper through extensive experiments +on real-world and simulated datasets. The results demonstrate the superiority +of our proposed framework and the effectiveness of each component. + +
+
+
+
+
+ + ☆ HAC++: Towards 100X Compression of 3D Gaussian Splatting TPAMI + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising framework for novel +view synthesis, boasting rapid rendering speed with high fidelity. However, the +substantial Gaussians and their associated attributes necessitate effective +compression techniques. Nevertheless, the sparse and unorganized nature of the +point cloud of Gaussians (or anchors in our paper) presents challenges for +compression. To achieve a compact size, we propose HAC++, which leverages the +relationships between unorganized anchors and a structured hash grid, utilizing +their mutual information for context modeling. Additionally, HAC++ captures +intra-anchor contextual relationships to further enhance compression +performance. To facilitate entropy coding, we utilize Gaussian distributions to +precisely estimate the probability of each quantized attribute, where an +adaptive quantization module is proposed to enable high-precision quantization +of these attributes for improved fidelity restoration. Moreover, we incorporate +an adaptive masking strategy to eliminate invalid Gaussians and anchors. +Overall, HAC++ achieves a remarkable size reduction of over 100X compared to +vanilla 3DGS when averaged on all datasets, while simultaneously improving +fidelity. It also delivers more than 20X size reduction compared to +Scaffold-GS. Our code is available at +https://github.com/YihangChen-ee/HAC-plus. + +
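The entropy-coding step relies on estimating, for each quantized attribute, its probability under a predicted Gaussian; a minimal sketch of that bit-rate estimate follows, where the quantization step and clamping constant are assumptions.

```python
import torch

def estimated_bits(x_q, mu, sigma, step=1.0):
    """Estimate coding cost of quantized values x_q under per-element Gaussians:
    p = CDF(x + step/2) - CDF(x - step/2), bits = -log2(p)."""
    dist = torch.distributions.Normal(mu, sigma)
    p = dist.cdf(x_q + step / 2) - dist.cdf(x_q - step / 2)
    return -torch.log2(p.clamp_min(1e-9)).sum()
```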
+
+ comment: IEEE TPAMI Submission. This paper is an extension of HAC at + arXiv:2403.14530 (ECCV 2024) +
+
+
+
+
+ + ☆ Memory Storyboard: Leveraging Temporal Segmentation for Streaming + Self-Supervised Learning from Egocentric Videos + + +
+ Self-supervised learning holds the promise to learn good representations from +real-world continuous uncurated data streams. However, most existing works in +visual self-supervised learning focus on static images or artificial data +streams. Towards exploring a more realistic learning substrate, we investigate +streaming self-supervised learning from long-form real-world egocentric video +streams. Inspired by the event segmentation mechanism in human perception and +memory, we propose "Memory Storyboard" that groups recent past frames into +temporal segments for more effective summarization of the past visual streams +for memory replay. To accommodate efficient temporal segmentation, we propose a +two-tier memory hierarchy: the recent past is stored in a short-term memory, +and the storyboard temporal segments are then transferred to a long-term +memory. Experiments on real-world egocentric video datasets including SAYCam +and KrishnaCam show that contrastive learning objectives on top of storyboard +frames result in semantically meaningful representations which outperform those +produced by state-of-the-art unsupervised continual learning methods. + +
+
+ comment: 20 pages, 8 figures +
+
+
+
+
+ + ☆ Video Deblurring by Sharpness Prior Detection and Edge Information + + +
+ Video deblurring is an essential task for autonomous driving, facial +recognition, and security surveillance. Traditional methods directly estimate +motion blur kernels, often introducing artifacts and leading to poor results. +Recent approaches utilize the detection of sharp frames within video sequences +to enhance deblurring. However, existing datasets rely on a fixed number of sharp +frames, which may be too restrictive for some applications and may introduce a +bias during model training. To address these limitations and enhance domain +adaptability, this work first introduces GoPro Random Sharp (GoProRS), a new +dataset where the frequency of sharp frames within the sequence is +customizable, allowing more diverse training and testing scenarios. +Furthermore, it presents a novel video deblurring model, called SPEINet, that +integrates sharp frame features into blurry frame reconstruction through an +attention-based encoder-decoder architecture, a lightweight yet robust sharp +frame detection module, and an edge extraction phase. Extensive experimental results +demonstrate that SPEINet outperforms state-of-the-art methods across multiple +datasets, achieving an average of +3.2% PSNR improvement over recent +techniques. Given such promising results, we believe that both the proposed +model and dataset pave the way for future advancements in video deblurring +based on the detection of sharp frames.
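A simple way to build intuition for sharp-frame detection is the classical variance-of-Laplacian sharpness prior; note this is only an illustrative proxy, not SPEINet's learned detector, and the function name and top-k selection are assumptions.

```python
import cv2

def sharpest_frame_indices(video_path, top_k=5):
    """Rank frames of a video by variance of the Laplacian (higher = sharper)."""
    cap = cv2.VideoCapture(video_path)
    scores, idx = [], 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        scores.append((cv2.Laplacian(gray, cv2.CV_64F).var(), idx))
        idx += 1
    cap.release()
    return [i for _, i in sorted(scores, reverse=True)[:top_k]]
```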
+
+ comment: Under review in Pattern Recognition +
+
+
+
+
+ + ☆ Quality Enhancement of Radiographic X-ray Images by Interpretable + Mapping + + +
+ X-ray imaging is the most widely used medical imaging modality. However, in +common practice, inconsistency in the initial presentation of X-ray images +is a frequent complaint by radiologists. Different patient positions, patient +habitus, and scanning protocols can lead to differences in image presentations, +e.g., differences in brightness and contrast globally or regionally. To +compensate for this, additional work is performed by clinical experts to +adjust the images to the desired presentation, which can be time-consuming. +Existing deep-learning-based end-to-end solutions can automatically correct +images with promising performance. Nevertheless, these methods are hard to +interpret and difficult for clinical experts to understand. In this +manuscript, a novel interpretable deep-learning-based mapping method is proposed, +which automatically enhances image brightness and contrast both globally and +locally. Meanwhile, because the model is inspired by the workflow of +brightness and contrast manipulation, it can provide interpretable pixel maps +that explain the motivation for the image enhancement. Experiments on +clinical datasets show that the proposed method provides consistent brightness +and contrast correction on X-ray images, with an accuracy of 24.75 dB PSNR and +0.8431 SSIM.
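One way to read "interpretable pixel maps" is that the enhancement is expressed as per-pixel contrast (gain) and brightness (bias) maps that can be inspected directly; a minimal sketch under that assumption, noting that the paper's exact parameterization may differ.

```python
import numpy as np

def apply_gain_bias_maps(image, gain_map, bias_map):
    """Apply predicted per-pixel contrast (gain) and brightness (bias) maps
    to an image normalized to [0, 1]; the maps themselves explain the correction."""
    return np.clip(gain_map * image + bias_map, 0.0, 1.0)

# hypothetical usage: uniform 20% contrast boost, no brightness change
# enhanced = apply_gain_bias_maps(img, np.full_like(img, 1.2), np.zeros_like(img))
```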
+
+ comment: SPIE Medical Imaging 2025 +
+
+
+
+
+ + ☆ Zero-shot Bias Correction: Efficient MR Image Inhomogeneity Reduction + Without Any Data + + +
+ In recent years, deep neural networks for image inhomogeneity reduction have +shown promising results. However, current (un)supervised solutions +require preparing a training dataset, which makes data collection expensive and +laborious. In this work, we present a novel zero-shot deep neural +network, which requires no data for pre-training and no dedicated assumption about +the bias field. The designed lightweight CNN enables efficient zero-shot +adaptation for correcting bias-corrupted images. Our method provides a novel +solution that treats mitigating the corrupted image as iterative homogeneity +refinement, which makes the problem easier to solve and ensures stable +convergence of the zero-shot optimization. Extensive comparisons on +different datasets show that the proposed method performs better than current +data-free N4 methods in both efficiency and accuracy.
+
+ comment: Accepted by ISBI 2025. Supported by IHI PREDICTOM Project +
+
+
+
+
+ + ☆ Investigating Market Strength Prediction with CNNs on Candlestick Chart + Images + + +
+ This paper investigates predicting market strength solely from candlestick +chart images to assist investment decisions. The core research problem is +developing an effective computer vision-based model using raw candlestick +visuals without time-series data. We specifically analyze the impact of +incorporating candlestick patterns detected by YOLOv8. The study +implements two approaches: a pure CNN on chart images and a Decomposer +architecture that detects patterns. Experiments utilize diverse financial datasets +spanning stocks, cryptocurrencies, and forex assets. Key findings demonstrate +that, in our experiments, candlestick patterns do not improve model performance over +image data alone. The significance of this result lies in illuminating the +limitations of candlestick image signals. Performance peaked at approximately 0.7 +accuracy, below that of more complex time-series models. The outcomes reveal +challenges in distilling sufficient predictive power from visual shapes alone, +motivating the incorporation of other data modalities. This research clarifies +how purely image-based models can inform trading while confirming that detected +patterns add little value over raw charts. The paper is organized into distinct +sections, each making a standalone contribution while remaining cohesively linked. +Note that the examples discussed herein are not limited to the scope, +applicability, or knowledge outlined in the paper.
+
+ comment: ACMLC 2025; 8 pages +
+
+
+
+
+ + ☆ DLEN: Dual Branch of Transformer for Low-Light Image Enhancement in Dual + Domains + + +
+ Low-light image enhancement (LLE) aims to improve the visual quality of +images captured in poorly lit conditions, which often suffer from low +brightness, low contrast, noise, and color distortions. These issues hinder the +performance of computer vision tasks such as object detection, facial +recognition, and autonomous driving. Traditional enhancement techniques, such as +multi-scale fusion and histogram equalization, fail to preserve fine details +and often struggle with maintaining the natural appearance of enhanced images +under complex lighting conditions. Although the Retinex theory provides a +foundation for image decomposition, it often amplifies noise, leading to +suboptimal image quality. In this paper, we propose the Dual Light Enhance +Network (DLEN), a novel architecture that incorporates two distinct attention +mechanisms, considering both spatial and frequency domains. Our model +introduces a learnable wavelet transform module in the illumination estimation +phase, preserving high- and low-frequency components to enhance edge and +texture details. Additionally, we design a dual-branch structure that leverages +the power of the Transformer architecture to enhance both the illumination and +structural components of the image. Through extensive experiments, our model +outperforms state-of-the-art methods on standard benchmarks. Code is available +here: https://github.com/LaLaLoXX/DLEN
+
+ comment: 10pages,6figures +
+
+
+
+
+ + ☆ InsTALL: Context-aware Instructional Task Assistance with Multi-modal + Large Language Models + + +
+ The improved competence of generative models can help build multi-modal +virtual assistants that leverage modalities beyond language. By observing +humans performing multi-step tasks, one can build assistants that have +situational awareness of actions and tasks being performed, enabling them to +tailor assistance to this understanding. In this paper, we develop a +Context-aware Instructional Task Assistant with Multi-modal Large Language +Models (InsTALL) that leverages an online visual stream (e.g. a user's screen +share or video recording) and responds in real-time to user queries related to +the task at hand. To enable useful assistance, InsTALL 1) trains a multi-modal +model on task videos and paired textual data, and 2) automatically extracts a +task graph from video data and leverages it at training and inference time. We +show InsTALL achieves state-of-the-art performance across proposed sub-tasks +considered for multimodal activity understanding -- task recognition (TR), +action recognition (AR), next action prediction (AP), and plan prediction (PP) +-- and outperforms existing baselines on two novel sub-tasks related to +automatic error identification.
+
+
+
+
+ + ☆ TokenVerse: Versatile Multi-concept Personalization in Token Modulation + Space + + +
+ We present TokenVerse -- a method for multi-concept personalization, +leveraging a pre-trained text-to-image diffusion model. Our framework can +disentangle complex visual elements and attributes from as little as a single +image, while enabling seamless plug-and-play generation of combinations of +concepts extracted from multiple images. As opposed to existing works, +TokenVerse can handle multiple images with multiple concepts each, and supports +a wide-range of concepts, including objects, accessories, materials, pose, and +lighting. Our work exploits a DiT-based text-to-image model, in which the input +text affects the generation through both attention and modulation (shift and +scale). We observe that the modulation space is semantic and enables localized +control over complex concepts. Building on this insight, we devise an +optimization-based framework that takes as input an image and a text +description, and finds for each word a distinct direction in the modulation +space. These directions can then be used to generate new images that combine +the learned concepts in a desired configuration. We demonstrate the +effectiveness of TokenVerse in challenging personalization settings, and +showcase its advantages over existing methods. project's webpage in +https://token-verse.github.io/ + +
+
+
+
+
+ + ☆ Exploring Temporally-Aware Features for Point Tracking + + +
+ Point tracking in videos is a fundamental task with applications in robotics, +video editing, and more. While many vision tasks benefit from pre-trained +feature backbones to improve generalizability, point tracking has primarily +relied on simpler backbones trained from scratch on synthetic data, which may +limit robustness in real-world scenarios. Additionally, point tracking requires +temporal awareness to ensure coherence across frames, but the use of +temporally-aware features is still underexplored. Most current methods employ +a two-stage process: an initial coarse prediction followed by a +refinement stage to inject temporal information and correct errors from the +coarse stage. This approach, however, is computationally expensive and +potentially redundant if the feature backbone itself captures sufficient +temporal information. + In this work, we introduce Chrono, a feature backbone specifically designed +for point tracking with built-in temporal awareness. Leveraging pre-trained +representations from the self-supervised learner DINOv2 and enhanced with a +temporal adapter, Chrono effectively captures long-term temporal context, +enabling precise prediction even without the refinement stage. Experimental +results demonstrate that Chrono achieves state-of-the-art performance in a +refiner-free setting on the TAP-Vid-DAVIS and TAP-Vid-Kinetics datasets, among +common feature backbones used in point tracking as well as DINOv2, with +exceptional efficiency. Project page: https://cvlab-kaist.github.io/Chrono/
+
+
+
+
+ + ☆ Early Detection and Classification of Breast Cancer Using Deep Learning + Techniques + + +
+ Breast cancer is one of the deadliest cancers, causing a massive number of +deaths annually all over the world according to the WHO. It is a type of cancer +that develops when the tissues of the breast grow rapidly and uncontrollably. +Many of these deaths can be prevented if the cancer is detected before it becomes +malignant. Artificial Intelligence and Machine Learning technologies can be used +to automate early-stage detection of breast cancer for the best outcome. In this +study, we use the Breast Cancer Image Classification dataset collected from the +Kaggle repository, which comprises 9248 breast ultrasound images classified into +three categories: Benign, Malignant, and Normal, referring to non-cancerous, +cancerous, and normal images. This research introduces three pretrained models +featuring custom classifiers, namely ResNet50, MobileNet, and VGG16, along with a +custom CNN model utilizing the ReLU activation function. The models ResNet50, +MobileNet, VGG16, and the custom CNN recorded accuracies of 98.41%, 97.91%, +98.19%, and 92.94% on the dataset, respectively, with ResNet50 achieving the +highest accuracy of 98.41%. This model, with its deep and powerful architecture, +is particularly successful in detecting aberrant cells as well as cancerous and +non-cancerous tumors. These accuracies show that machine learning methods are +well suited to the classification and early detection of breast cancer.
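A minimal sketch of the kind of pretrained-backbone-plus-custom-classifier setup described above; the exact head layout, freezing policy, and hyperparameters are assumptions, and recent torchvision (>= 0.13) is assumed for the weights API.

```python
import torch.nn as nn
from torchvision import models

def build_resnet50_classifier(num_classes=3):
    """ResNet50 backbone with a small custom head for Benign / Malignant / Normal."""
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
    for p in model.parameters():          # freeze the pretrained backbone
        p.requires_grad = False
    model.fc = nn.Sequential(             # custom classifier with ReLU activation
        nn.Linear(model.fc.in_features, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, num_classes),
    )
    return model
```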
+
+
+
+
+ + ☆ RL-RC-DoT: A Block-level RL agent for Task-Aware Video Compression + + +
+ Video encoders optimize compression for human perception by minimizing +reconstruction error under bit-rate constraints. In many modern applications +such as autonomous driving, an overwhelming majority of videos serve as input +for AI systems performing tasks like object recognition or segmentation, rather +than being watched by humans. It is therefore useful to optimize the encoder +for a downstream task instead of for perceptual image quality. However, a major +challenge is how to combine such downstream optimization with existing standard +video encoders, which are highly efficient and popular. Here, we address this +challenge by controlling the Quantization Parameters (QPs) at the macro-block +level to optimize the downstream task. This granular control allows us to +prioritize encoding for task-relevant regions within each frame. We formulate +this optimization problem as a Reinforcement Learning (RL) task, where the +agent learns to balance long-term implications of choosing QPs on both task +performance and bit-rate constraints. Notably, our policy does not require the +downstream task as an input during inference, making it suitable for streaming +applications and edge devices such as vehicles. We demonstrate significant +improvements in two tasks, car detection, and ROI (saliency) encoding. Our +approach improves task performance for a given bit rate compared to traditional +task agnostic encoding methods, paving the way for more efficient task-aware +video compression. + +
+
+
+
+
+ + ☆ Fixing Imbalanced Attention to Mitigate In-Context Hallucination of + Large Vision-Language Model + + +
+ Large Vision Language Models (LVLMs) have demonstrated remarkable +capabilities in understanding and describing visual content, achieving +state-of-the-art performance across various vision-language tasks. However, +these models frequently exhibit hallucination behavior, where they generate +descriptions containing objects or details absent in the input image. Our work +investigates this phenomenon by analyzing attention patterns across transformer +layers and heads, revealing that hallucinations often stem from progressive +degradation of visual grounding in deeper layers. We propose a novel attention +modification approach that combines selective token emphasis and head-specific +modulation to maintain visual grounding throughout the generation process. Our +method introduces two key components: (1) a dual-stream token selection +mechanism that identifies and prioritizes both locally informative and +spatially significant visual tokens, and (2) an attention head-specific +modulation strategy that differentially amplifies visual information processing +based on measured visual sensitivity of individual attention heads. Through +extensive experimentation on the MSCOCO dataset, we demonstrate that our +approach reduces hallucination rates by up to 62.3\% compared to baseline +models while maintaining comparable task performance. Our analysis reveals that +selectively modulating tokens across attention heads with varying levels of +visual sensitivity can significantly improve visual grounding without requiring +model retraining. + +
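The head-specific modulation idea can be sketched as re-scaling post-softmax attention toward visual tokens with a per-head gain and renormalizing; the tensor layout, gain schedule, and index set below are assumptions, and the paper's actual mechanism also includes token selection.

```python
import torch

def amplify_visual_attention(attn, visual_token_idx, head_gain):
    """Re-weight attention toward visual tokens per head, then renormalize.
    attn: (num_heads, query_len, key_len) post-softmax attention weights."""
    scaled = attn.clone()
    scaled[:, :, visual_token_idx] *= head_gain.view(-1, 1, 1)
    return scaled / scaled.sum(dim=-1, keepdim=True)

# hypothetical usage: 32 heads, 50 queries, 600 keys of which the first 576 are visual
attn = torch.rand(32, 50, 600).softmax(dim=-1)
out = amplify_visual_attention(attn, torch.arange(576), torch.linspace(1.0, 1.5, 32))
```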
+
+ comment: 10 pages, 5 tables, 4 figures +
+
+
+
+
+ + ☆ Explainability for Vision Foundation Models: A Survey + + +
+ As artificial intelligence systems become increasingly integrated into daily +life, the field of explainability has gained significant attention. This trend +is particularly driven by the complexity of modern AI models and their +decision-making processes. The advent of foundation models, characterized by +their extensive generalization capabilities and emergent uses, has further +complicated this landscape. Foundation models occupy an ambiguous position in +the explainability domain: their complexity makes them inherently challenging +to interpret, yet they are increasingly leveraged as tools to construct +explainable models. In this survey, we explore the intersection of foundation +models and eXplainable AI (XAI) in the vision domain. We begin by compiling a +comprehensive corpus of papers that bridge these fields. Next, we categorize +these works based on their architectural characteristics. We then discuss the +challenges faced by current research in integrating XAI within foundation +models. Furthermore, we review common evaluation methodologies for these +combined approaches. Finally, we present key observations and insights from our +survey, offering directions for future research in this rapidly evolving field. + +
+
+
+
+
+ + ☆ Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D + Assets Generation + + +
+ We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for +generating high-resolution textured 3D assets. This system includes two +foundation components: a large-scale shape generation model -- Hunyuan3D-DiT, +and a large-scale texture synthesis model -- Hunyuan3D-Paint. The shape +generative model, built on a scalable flow-based diffusion transformer, aims to +create geometry that properly aligns with a given condition image, laying a +solid foundation for downstream applications. The texture synthesis model, +benefiting from strong geometric and diffusion priors, produces high-resolution +and vibrant texture maps for either generated or hand-crafted meshes. +Furthermore, we build Hunyuan3D-Studio -- a versatile, user-friendly production +platform that simplifies the re-creation process of 3D assets. It allows both +professional and amateur users to manipulate or even animate their meshes +efficiently. We systematically evaluate our models, showing that Hunyuan3D 2.0 +outperforms previous state-of-the-art models, including the open-source models +and closed-source models in geometry details, condition alignment, texture +quality, and etc. Hunyuan3D 2.0 is publicly released in order to fill the gaps +in the open-source 3D community for large-scale foundation generative models. +The code and pre-trained weights of our models are available at: +https://github.com/Tencent/Hunyuan3D-2 + +
+
+ comment: GitHub link: https://github.com/Tencent/Hunyuan3D-2 +
+
+
+
+
+ + ☆ A margin-based replacement for cross-entropy loss + + +
+ Cross-entropy (CE) loss is the de-facto standard for training deep neural +networks to perform classification. However, CE-trained deep neural networks +struggle with robustness and generalisation issues. To alleviate these issues, +we propose high error margin (HEM) loss, a variant of multi-class margin loss +that overcomes the training issues of other margin-based losses. We evaluate +HEM extensively on a range of architectures and datasets. We find that HEM loss +is more effective than cross-entropy loss across a wide range of tasks: unknown +class rejection, adversarial robustness, learning with imbalanced data, +continual learning, and semantic segmentation (a pixel-level classification +task). Despite all training hyper-parameters being chosen for CE loss, HEM is +inferior to CE only in terms of clean accuracy and this difference is +insignificant. We also compare HEM to specialised losses that have previously +been proposed to improve performance on specific tasks. LogitNorm, a loss +achieving state-of-the-art performance on unknown class rejection, produces +similar performance to HEM for this task, but is much poorer for continual +learning and semantic segmentation. Logit-adjusted loss, designed for +imbalanced data, has superior results to HEM for that task, but performs more +poorly on unknown class rejection and semantic segmentation. DICE, a popular +loss for semantic segmentation, is inferior to HEM loss on all tasks, including +semantic segmentation. Thus, HEM often out-performs specialised losses, and in +contrast to them, is a general-purpose replacement for CE loss. + +
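A minimal sketch of a multi-class margin loss in the spirit of HEM: penalize any incorrect-class logit that comes within a margin of the true-class logit. The exact formulation, margin value, and reduction used in the paper are assumptions.

```python
import torch

def margin_loss(logits, targets, margin=10.0):
    """Hinge-style multi-class margin loss with a large margin."""
    true = logits.gather(1, targets.unsqueeze(1))                  # (B, 1) true-class logits
    hinge = (logits - true + margin).clamp(min=0.0)                # per-class margin violations
    one_hot = torch.zeros_like(logits).scatter(1, targets.unsqueeze(1), 1.0)
    return (hinge * (1.0 - one_hot)).mean()                        # exclude the true class itself

# hypothetical usage: batch of 8 samples, 10 classes
loss = margin_loss(torch.randn(8, 10), torch.randint(0, 10, (8,)))
```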
+
+ comment: Code: https://codeberg.org/mwspratling/HEMLoss +
+
+
+
+
+ + ☆ High-dimensional multimodal uncertainty estimation by manifold + alignment:Application to 3D right ventricular strain computations + + +
+ Confidence in the results is a key ingredient to improve the adoption of +machine learning methods by clinicians. Uncertainties on the results have been +considered in the literature, but mostly those originating from the learning +and processing methods. Uncertainty on the data is hardly challenged, as a +single sample is often considered representative enough of each subject +included in the analysis. In this paper, we propose a representation learning +strategy to estimate local uncertainties on a physiological descriptor (here, +myocardial deformation) previously obtained from medical images by different +definitions or computations. We first use manifold alignment to match the +latent representations associated to different high-dimensional input +descriptors. Then, we formulate plausible distributions of latent +uncertainties, and finally exploit them to reconstruct uncertainties on the +input high-dimensional descriptors. We demonstrate its relevance for the +quantification of myocardial deformation (strain) from 3D echocardiographic +image sequences of the right ventricle, for which a lack of consensus exists in +its definition and which directional component to use. We used a database of +100 control subjects with right ventricle overload, for which different types +of strain are available at each point of the right ventricle endocardial +surface mesh. Our approach quantifies local uncertainties on myocardial +deformation from different descriptors defining this physiological concept. +Such uncertainties cannot be directly estimated by local statistics on such +descriptors, potentially of heterogeneous types. Beyond this controlled +illustrative application, our methodology has the potential to be generalized +to many other population analyses considering heterogeneous high-dimensional +descriptors. + +
+
+
+
+
+ + ☆ ComposeAnyone: Controllable Layout-to-Human Generation with Decoupled + Multimodal Conditions + + +
+ Building on the success of diffusion models, significant advancements have +been made in multimodal image generation tasks. Among these, human image +generation has emerged as a promising technique, offering the potential to +revolutionize the fashion design process. However, existing methods often focus +solely on text-to-image or image reference-based human generation, which fails +to satisfy the increasingly sophisticated demands. To address the limitations +of flexibility and precision in human generation, we introduce ComposeAnyone, a +controllable layout-to-human generation method with decoupled multimodal +conditions. Specifically, our method allows decoupled control of any part in +hand-drawn human layouts using text or reference images, seamlessly integrating +them during the generation process. The hand-drawn layout, which utilizes +color-blocked geometric shapes such as ellipses and rectangles, can be easily +drawn, offering a more flexible and accessible way to define spatial layouts. +Additionally, we introduce the ComposeHuman dataset, which provides decoupled +text and reference image annotations for different components of each human +image, enabling broader applications in human image generation tasks. Extensive +experiments on multiple datasets demonstrate that ComposeAnyone generates human +images with better alignment to given layouts, text descriptions, and reference +images, showcasing its multi-task capability and controllability. + +
+
+
+
+
+ + ☆ SVGS-DSGAT: An IoT-Enabled Innovation in Underwater Robotic Object + Detection Technology + + +
+ With the advancement of Internet of Things (IoT) technology, underwater +target detection and tracking have become increasingly important for ocean +monitoring and resource management. Existing methods often fall short in +handling high-noise and low-contrast images in complex underwater environments, +lacking precision and robustness. This paper introduces a novel SVGS-DSGAT +model that combines GraphSage, SVAM, and DSGAT modules, enhancing feature +extraction and target detection capabilities through graph neural networks and +attention mechanisms. The model integrates IoT technology to facilitate +real-time data collection and processing, optimizing resource allocation and +model responsiveness. Experimental results demonstrate that the SVGS-DSGAT +model achieves an mAP of 40.8% on the URPC 2020 dataset and 41.5% on the +SeaDronesSee dataset, significantly outperforming existing mainstream models. +This IoT-enhanced approach not only excels in high-noise and complex +backgrounds but also improves the overall efficiency and scalability of the +system. This research provides an effective IoT solution for underwater target +detection technology, offering significant practical application value and +broad development prospects. + +
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ☆ Fast-RF-Shimming: Accelerate RF Shimming in 7T MRI using Deep Learning + + +
+ Ultrahigh field (UHF) Magnetic Resonance Imaging (MRI) provides a high +signal-to-noise ratio (SNR), enabling exceptional spatial resolution for +clinical diagnostics and research. However, higher fields introduce challenges +such as transmit radiofrequency (RF) field inhomogeneities, which result in +uneven flip angles and image intensity artifacts. These artifacts degrade image +quality and limit clinical adoption. Traditional RF shimming methods, including +Magnitude Least Squares (MLS) optimization, mitigate RF field inhomogeneity but +are time-intensive and often require the presence of the patient. Recent +machine learning methods, such as RF Shim Prediction by Iteratively Projected +Ridge Regression and other deep learning architectures, offer alternative +approaches but face challenges such as extensive training requirements, limited +complexity, and practical data constraints. This paper introduces a holistic +learning-based framework called Fast RF Shimming, which achieves a 5000-fold +speedup compared to MLS methods. First, random-initialized Adaptive Moment +Estimation (Adam) derives reference shimming weights from multichannel RF +fields. Next, a Residual Network (ResNet) maps RF fields to shimming outputs +while incorporating a confidence parameter into the loss function. Finally, a +Non-uniformity Field Detector (NFD) identifies extreme non-uniform outcomes. +Comparative evaluations demonstrate significant improvements in both speed and +predictive accuracy. The proposed pipeline also supports potential extensions, +such as the integration of anatomical priors or multi-echo data, to enhance the +robustness of RF field correction. This approach offers a faster and more +efficient solution to RF shimming challenges in UHF MRI. + +
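The first stage (deriving reference shimming weights with randomly initialized Adam) can be sketched as a magnitude-least-squares style optimization over complex channel weights; the B1 map shape, unit-magnitude target, and hyperparameters below are assumptions, not the paper's settings.

```python
import torch

def adam_shim(b1_maps, steps=500, lr=0.05):
    """Find complex channel weights w so that |b1_maps @ w| is close to uniform.
    b1_maps: complex tensor of shape (num_voxels, num_channels)."""
    num_channels = b1_maps.shape[1]
    w = torch.randn(num_channels, dtype=torch.complex64, requires_grad=True)
    opt = torch.optim.Adam([w], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        combined = (b1_maps @ w).abs()            # combined transmit field magnitude
        loss = ((combined - 1.0) ** 2).mean()     # deviation from a uniform target
        loss.backward()
        opt.step()
    return w.detach()

# hypothetical usage: 4096 voxels, 8 transmit channels
weights = adam_shim(torch.randn(4096, 8, dtype=torch.complex64))
```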
+
+
+
+
+ + ☆ DNRSelect: Active Best View Selection for Deferred Neural Rendering ICRA 2025 + + +
+ Deferred neural rendering (DNR) is an emerging computer graphics pipeline +designed for high-fidelity rendering and robotic perception. However, DNR +heavily relies on datasets composed of numerous ray-traced images and demands +substantial computational resources. It remains under-explored how to reduce +the reliance on high-quality ray-traced images while maintaining the rendering +fidelity. In this paper, we propose DNRSelect, which integrates a reinforcement +learning-based view selector and a 3D texture aggregator for deferred neural +rendering. We first propose a novel view selector for deferred neural rendering +based on reinforcement learning, which is trained on easily obtained rasterized +images to identify the optimal views. By acquiring only a few ray-traced images +for these selected views, the selector enables DNR to achieve high-quality +rendering. To further enhance spatial awareness and geometric consistency in +DNR, we introduce a 3D texture aggregator that fuses pyramid features from +depth maps and normal maps with UV maps. Given that acquiring ray-traced images +is more time-consuming than generating rasterized images, DNRSelect minimizes +the need for ray-traced data by using only a few selected views while still +achieving high-fidelity rendering results. We conduct detailed experiments and +ablation studies on the NeRF-Synthetic dataset to demonstrate the effectiveness +of DNRSelect. The code will be released. + +
+
+ comment: 7 pages, 8 figures, submitted to ICRA 2025 +
+
+
+
+
+ + ☆ ENTIRE: Learning-based Volume Rendering Time Prediction + + +
+ We present ENTIRE, a novel approach for volume rendering time prediction. +Time-dependent volume data from simulations or experiments typically comprise +complex deforming structures across hundreds or thousands of time steps, which +in addition to the camera configuration has a significant impact on rendering +performance. We first extract a feature vector from a volume that captures its +structure that is relevant for rendering time performance. Then we combine this +feature vector with further relevant parameters (e.g. camera setup), and with +this perform the final prediction. Our experiments conducted on various +datasets demonstrate that our model is capable of efficiently achieving high +prediction accuracy with fast response rates. We showcase ENTIRE's capability +of enabling dynamic parameter adaptation for stable frame rates and load +balancing in two case studies. + +
+
+
+
+
+ + ☆ Meta-Sparsity: Learning Optimal Sparse Structures in Multi-task Networks + through Meta-learning + + +
+ This paper presents meta-sparsity, a framework for learning model sparsity +(that is, learning the parameter that controls the degree of sparsity) that allows +deep neural networks (DNNs) to inherently generate optimal sparse shared +structures in a multi-task learning (MTL) setting. The proposed approach enables +the dynamic learning of sparsity patterns across a variety of tasks, unlike +traditional sparsity methods that rely heavily on manual hyperparameter tuning. +Inspired by Model Agnostic Meta-Learning (MAML), the emphasis is on learning +shared and optimally sparse parameters in multi-task scenarios by implementing +a penalty-based, channel-wise structured sparsity during the meta-training +phase. This method improves the model's efficacy by removing unnecessary +parameters and enhances its ability to handle both seen and previously unseen +tasks. The effectiveness of meta-sparsity is rigorously evaluated by extensive +experiments on two datasets, NYU-v2 and CelebAMask-HQ, covering a broad +spectrum of tasks ranging from pixel-level to image-level predictions. The +results show that the proposed approach performs well across many tasks, +indicating its potential as a versatile tool for creating efficient and +adaptable sparse neural networks. This work, therefore, presents an approach +towards learning sparsity, contributing to the efforts in the field of sparse +neural networks and suggesting new directions for research towards parsimonious +models.
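The penalty-based, channel-wise structured sparsity term can be illustrated with a group-lasso style regularizer over convolutional output channels; this is a sketch of the general mechanism only, and the paper's actual penalty and its meta-learned weighting may differ.

```python
import torch.nn as nn

def channel_group_lasso(model, lam=1e-4):
    """Group-lasso penalty: L2 norm per output channel, summed across channels,
    which pushes entire channels toward zero (structured sparsity)."""
    penalty = 0.0
    for module in model.modules():
        if isinstance(module, nn.Conv2d):
            penalty = penalty + module.weight.flatten(1).norm(dim=1).sum()
    return lam * penalty

# hypothetical usage inside a training step:
# loss = task_loss + channel_group_lasso(shared_backbone)
```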
+
+
+
+
+ + ☆ Teacher Encoder-Student Decoder Denoising Guided Segmentation Network + for Anomaly Detection + + +
+ Visual anomaly detection is a highly challenging task, often categorized as a +one-class classification and segmentation problem. Recent studies have +demonstrated that the student-teacher (S-T) framework effectively addresses +this challenge. However, most S-T frameworks rely solely on pre-trained teacher +networks to guide student networks in learning multi-scale similar features, +overlooking the potential of the student networks to enhance learning through +multi-scale feature fusion. In this study, we propose a novel model named +PFADSeg, which integrates a pre-trained teacher network, a denoising student +network with multi-scale feature fusion, and a guided anomaly segmentation +network into a unified framework. By adopting a unique teacher-encoder and +student-decoder denoising mode, the model improves the student network's +ability to learn from teacher network features. Furthermore, an adaptive +feature fusion mechanism is introduced to train a self-supervised segmentation +network that synthesizes anomaly masks autonomously, significantly increasing +detection performance. Evaluated on the MVTec AD dataset, PFADSeg achieves +state-of-the-art results with an image-level AUC of 98.9%, a pixel-level mean +precision of 76.4%, and an instance-level mean precision of 78.7%. + +
+
+
+
+
+ + ☆ Proxies for Distortion and Consistency with Applications for Real-World + Image Restoration + + +
+ Real-world image restoration deals with the recovery of images suffering from +an unknown degradation. This task is typically addressed while being given only +degraded images, without their corresponding ground-truth versions. In this +hard setting, designing and evaluating restoration algorithms becomes highly +challenging. This paper offers a suite of tools that can serve both the design +and assessment of real-world image restoration algorithms. Our work starts by +proposing a trained model that predicts the chain of degradations a given +real-world measured input has gone through. We show how this estimator can be +used to approximate the consistency -- the match between the measurements and +any proposed recovered image. We also use this estimator as a guiding force for +the design of a simple and highly-effective plug-and-play real-world image +restoration algorithm, leveraging a pre-trained diffusion-based image prior. +Furthermore, this work proposes no-reference proxy measures of MSE and LPIPS, +which, without access to the ground-truth images, allow ranking of real-world +image restoration algorithms according to their (approximate) MSE and LPIPS. +The proposed suite provides a versatile, first of its kind framework for +evaluating and comparing blind image restoration algorithms in real-world +scenarios. + +
+
+ comment: Project page in https://man-sean.github.io/elad-website/ +
+
+
+
+
+ + ☆ UAV-Assisted Real-Time Disaster Detection Using Optimized Transformer + Model + + +
+ Disaster recovery and management present significant challenges, particularly +in unstable environments and hard-to-reach terrains. These difficulties can be +overcome by employing unmanned aerial vehicles (UAVs) equipped with onboard +embedded platforms and camera sensors. In this work, we address the critical +need for accurate and timely disaster detection by enabling onboard aerial +imagery processing and avoiding connectivity, privacy, and latency issues +despite the challenges posed by limited onboard hardware resources. We propose +a UAV-assisted edge framework for real-time disaster management, leveraging our +proposed model optimized for real-time aerial image classification. The +optimization of the model employs post-training quantization techniques. For +real-world disaster scenarios, we introduce a novel dataset, DisasterEye, +featuring UAV-captured disaster scenes as well as ground-level images taken by +individuals on-site. Experimental results demonstrate the effectiveness of our +model, achieving high accuracy with reduced inference latency and memory usage +on resource-constrained devices. The framework's scalability and adaptability +make it a robust solution for real-time disaster detection on resource-limited +UAV platforms. + +
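Post-training quantization of the kind mentioned can be sketched with PyTorch's dynamic quantization utility, shown here for linear layers only; the paper's model, target layers, and quantization scheme are assumptions.

```python
import torch
import torch.nn as nn

def quantize_for_edge(model: nn.Module) -> nn.Module:
    """Convert eligible layers to int8 for lower memory use and faster CPU inference."""
    model.eval()
    return torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
```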
+
+
+
+
+ + ☆ DSTSA-GCN: Advancing Skeleton-Based Gesture Recognition with + Semantic-Aware Spatio-Temporal Topology Modeling + + +
+ Graph convolutional networks (GCNs) have emerged as a powerful tool for +skeleton-based action and gesture recognition, thanks to their ability to model +spatial and temporal dependencies in skeleton data. However, existing GCN-based +methods face critical limitations: (1) they lack effective spatio-temporal +topology modeling that captures dynamic variations in skeletal motion, and (2) +they struggle to model multiscale structural relationships beyond local joint +connectivity. To address these issues, we propose a novel framework called +Dynamic Spatial-Temporal Semantic Awareness Graph Convolutional Network +(DSTSA-GCN). DSTSA-GCN introduces three key modules: Group Channel-wise Graph +Convolution (GC-GC), Group Temporal-wise Graph Convolution (GT-GC), and +Multi-Scale Temporal Convolution (MS-TCN). GC-GC and GT-GC operate in parallel +to independently model channel-specific and frame-specific correlations, +enabling robust topology learning that accounts for temporal variations. +Additionally, both modules employ a grouping strategy to adaptively capture +multiscale structural relationships. Complementing this, MS-TCN enhances +temporal modeling through group-wise temporal convolutions with diverse +receptive fields. Extensive experiments demonstrate that DSTSA-GCN +significantly improves the topology modeling capabilities of GCNs, achieving +state-of-the-art performance on benchmark datasets for gesture and action +recognition, including SHREC17 Track, DHG-14\/28, NTU-RGB+D, and NTU-RGB+D-120. + +
+
+ comment: submit to Neurocomputing +
+
+
+
+
+ + ☆ Scalable Whole Slide Image Representation Using K-Mean Clustering and + Fisher Vector Aggregation + + +
+ Whole slide images (WSIs) are high-resolution, gigapixel-sized images that
+pose significant computational challenges for traditional machine learning
+models due to their size and heterogeneity. In this paper, we present a scalable
+and efficient methodology for WSI classification by leveraging patch-based
+feature extraction, clustering, and Fisher vector encoding. Initially, WSIs are
+divided into fixed-size patches, and deep feature embeddings are extracted from
+each patch using a pre-trained convolutional neural network (CNN). These
+patch-level embeddings are subsequently clustered using K-means clustering,
+where each cluster aggregates semantically similar regions of the WSI. To
+effectively summarize each cluster, Fisher vector representations are computed
+by modeling the distribution of patch embeddings in each cluster as a
+parametric Gaussian mixture model (GMM). The Fisher vectors from each cluster
+are concatenated into a high-dimensional feature vector, creating a compact and
+informative representation of the entire WSI. This feature vector is then used
+by a classifier to predict the WSI's diagnostic label. Our method captures
+local and global tissue structures and yields robust performance for
+large-scale WSI classification, demonstrating superior accuracy and scalability
+compared to other approaches.
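To make the clustering and Fisher-vector aggregation pipeline above concrete, here is a minimal sketch using scikit-learn with random stand-in patch embeddings. It computes a simplified Fisher vector (gradients with respect to the GMM means only) and is not the authors' implementation.

```python
# Hedged sketch of the K-means + Fisher-vector pipeline described above.
# Patch embeddings are random stand-ins for CNN features of one WSI.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
patch_feats = rng.normal(size=(2000, 64))           # (num_patches, feat_dim)

kmeans = KMeans(n_clusters=8, n_init=10, random_state=0).fit(patch_feats)

def fisher_vector(x, gmm):
    """Simplified Fisher vector: gradients w.r.t. the GMM means only."""
    gamma = gmm.predict_proba(x)                     # soft assignments (n, K)
    fv = []
    for k in range(gmm.n_components):
        diff = (x - gmm.means_[k]) / np.sqrt(gmm.covariances_[k])
        g = gamma[:, k:k + 1] * diff
        fv.append(g.sum(axis=0) / (len(x) * np.sqrt(gmm.weights_[k])))
    return np.concatenate(fv)

wsi_vector = []
for c in range(kmeans.n_clusters):
    members = patch_feats[kmeans.labels_ == c]
    gmm = GaussianMixture(n_components=2, covariance_type="diag",
                          random_state=0).fit(members)
    wsi_vector.append(fisher_vector(members, gmm))

wsi_vector = np.concatenate(wsi_vector)              # compact WSI descriptor
print(wsi_vector.shape)                              # feeds a downstream classifier
```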
+
+
+
+
+ + ☆ A Multi-annotated and Multi-modal Dataset for Wide-angle Video Quality + Assessment + + +
+ Wide-angle video is favored for its wide viewing angle and ability to capture
+a large area of scenery, making it an ideal choice for sports and adventure
+recording. However, wide-angle video is prone to deformation, exposure and
+other distortions, resulting in poor video quality and affecting the perception
+and experience, which may seriously hinder its application in fields such as
+competitive sports. Up to now, few explorations focus on the quality assessment
+issue of wide-angle video. This deficiency primarily stems from the absence of
+a specialized dataset for wide-angle videos. To bridge this gap, we construct
+the first Multi-annotated and multi-modal Wide-angle Video quality assessment
+(MWV) dataset. Then, the performances of state-of-the-art video quality methods
+on the MWV dataset are investigated by inter-dataset testing and intra-dataset
+testing. Experimental results show that these methods exhibit significant
+limitations in their applicability to wide-angle video.
+
+
+
+
+ + ☆ Towards autonomous photogrammetric forest inventory using a lightweight + under-canopy robotic drone + + +
+ Drones are increasingly used in forestry to capture high-resolution remote +sensing data. While operations above the forest canopy are already highly +automated, flying inside forests remains challenging, primarily relying on +manual piloting. Inside dense forests, reliance on the Global Navigation +Satellite System (GNSS) for localization is not feasible. Additionally, the +drone must autonomously adjust its flight path to avoid collisions. Recently, +advancements in robotics have enabled autonomous drone flights in GNSS-denied +obstacle-rich areas. In this article, a step towards autonomous forest data +collection is taken by building a prototype of a robotic under-canopy drone +utilizing state-of-the-art open-source methods and validating its performance +for data collection inside forests. The autonomous flight capability was +evaluated through multiple test flights in two boreal forest test sites. The +tree parameter estimation capability was studied by conducting diameter at +breast height (DBH) estimation using onboard stereo camera data and +photogrammetric methods. The prototype conducted flights in selected +challenging forest environments, and the experiments showed excellent +performance in forest reconstruction with a miniaturized stereoscopic +photogrammetric system. The stem detection algorithm managed to identify 79.31 +% of the stems. The DBH estimation had a root mean square error (RMSE) of 3.33 +cm (12.79 %) and a bias of 1.01 cm (3.87 %) across all trees. For trees with a +DBH less than 30 cm, the RMSE was 1.16 cm (5.74 %), and the bias was 0.13 cm +(0.64 %). When considering the overall performance in terms of DBH accuracy, +autonomy, and forest complexity, the proposed approach was superior compared to +methods proposed in the scientific literature. Results provided valuable +insights into autonomous forest reconstruction using drones, and several +further development topics were proposed. + +
+
+ comment: 35 pages, 13 Figures +
+
+
+
+
+ + ☆ Co-Paced Learning Strategy Based on Confidence for Flying Bird Object + Detection Model Training + + +
+ To mitigate the adverse effects of hard samples on the training of the Flying +Bird Object Detection (FBOD) model for surveillance videos, we propose a +Co-Paced Learning Based on Confidence (CPL-BC) strategy and apply this strategy +to the training process of the FBOD model. This strategy involves maintaining +two models with identical structures but different initial parameter +configurations, which collaborate with each other to select easy samples with +prediction confidence exceeding a set threshold for training. As training +progresses, the strategy gradually lowers the threshold, allowing more samples +to participate, enhancing the model's ability to recognize objects from easy to +hard. Before applying the CPL-BC strategy to train the FBOD models, we +initially trained the two FBOD models to equip them with the capability to +assess the difficulty level of flying bird object samples. Experimental results +on two different datasets of flying bird objects in surveillance videos +demonstrate that, compared to other model learning strategies, CPL-BC +significantly improves detection accuracy, verifying the effectiveness and +advancement of this method. + +
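A toy sketch of the confidence-based co-paced selection loop described above is given below, using a generic classifier pair rather than the FBOD detector. The threshold schedule, model sizes, and data are illustrative assumptions, not values from the paper.

```python
# Hedged toy sketch of confidence-based co-paced learning (CPL-BC style):
# two peer models select easy samples for each other, and the confidence
# threshold is gradually lowered to admit harder samples.
import torch
import torch.nn as nn

def make_model():
    return nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2))

model_a, model_b = make_model(), make_model()
opt_a = torch.optim.Adam(model_a.parameters(), lr=1e-3)
opt_b = torch.optim.Adam(model_b.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

x = torch.randn(256, 16)                 # stand-in samples
y = torch.randint(0, 2, (256,))          # stand-in labels

threshold = 0.9                          # start with only "easy" samples
for epoch in range(10):
    for selector, learner, opt in ((model_a, model_b, opt_b),
                                   (model_b, model_a, opt_a)):
        with torch.no_grad():
            conf = selector(x).softmax(dim=1).max(dim=1).values
        easy = conf >= threshold         # peer model picks confident samples
        if easy.any():
            opt.zero_grad()
            loss = loss_fn(learner(x[easy]), y[easy])
            loss.backward()
            opt.step()
    threshold = max(0.5, threshold - 0.05)   # gradually admit harder samples
```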
+
+
+
+
+ + ☆ GaussianVideo: Efficient Video Representation Through 2D Gaussian + Splatting + + +
+ 3D Gaussian splats have emerged as a revolutionary, effective, learned +representation for static 3D scenes. In this work, we explore using 2D Gaussian +splats as a new primitive for representing videos. We propose GaussianVideo, an +approach to learning a set of 2D Gaussian splats that can effectively represent +video frames. GaussianVideo incorporates the following techniques: (i) To +exploit temporal redundancy among adjacent frames, which can speed up training +and improve the compression efficiency, we predict the Gaussian splats of a +frame based on its previous frame; (ii) To control the trade-offs between file +size and quality, we remove Gaussian splats with low contribution to the video +quality; (iii) To capture dynamics in videos, we randomly add Gaussian splats +to fit content with large motion or newly-appeared objects; (iv) To handle +significant changes in the scene, we detect key frames based on loss +differences during the learning process. Experiment results show that +GaussianVideo achieves good rate-distortion trade-offs, comparable to +state-of-the-art video codecs such as AV1 and VVC, and a rendering speed of +1500 fps for a 1920x1080 video. + +
+
+
+
+
+ + ☆ Unified 3D MRI Representations via Sequence-Invariant Contrastive + Learning + + +
+ Self-supervised deep learning has accelerated 2D natural image analysis but +remains difficult to translate into 3D MRI, where data are scarce and +pre-trained 2D backbones cannot capture volumetric context. We present a +sequence-invariant self-supervised framework leveraging quantitative MRI +(qMRI). By simulating multiple MRI contrasts from a single 3D qMRI scan and +enforcing consistent representations across these contrasts, we learn +anatomy-centric rather than sequence-specific features. This yields a robust 3D +encoder that performs strongly across varied tasks and protocols. Experiments +on healthy brain segmentation (IXI), stroke lesion segmentation (ARC), and MRI +denoising show significant gains over baseline SSL approaches, especially in +low-data settings (up to +8.3% Dice, +4.2 dB PSNR). Our model also generalises +effectively to unseen sites, demonstrating potential for more scalable and +clinically reliable volumetric analysis. All code and trained models are +publicly available. + +
+
+
+
+
+ + ☆ ORCAst: Operational High-Resolution Current Forecasts + + +
+ We present ORCAst, a multi-stage, multi-arm network for Operational +high-Resolution Current forecAsts over one week. Producing real-time nowcasts +and forecasts of ocean surface currents is a challenging problem due to +indirect or incomplete information from satellite remote sensing data. Entirely +trained on real satellite data and in situ measurements from drifters, our +model learns to forecast global ocean surface currents using various sources of +ground truth observations in a multi-stage learning procedure. Our multi-arm +encoder-decoder model architecture allows us to first predict sea surface +height and geostrophic currents from larger quantities of nadir and SWOT +altimetry data, before learning to predict ocean surface currents from much +more sparse in situ measurements from drifters. Training our model on specific +regions improves performance. Our model achieves stronger nowcast and forecast +performance in predicting ocean surface currents than various state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Aggrotech: Leveraging Deep Learning for Sustainable Tomato Disease + Management + + +
+ Tomato crop health plays a critical role in ensuring agricultural +productivity and food security. Timely and accurate detection of diseases +affecting tomato plants is vital for effective disease management. In this +study, we propose a deep learning-based approach for Tomato Leaf Disease +Detection using two well-established convolutional neural networks (CNNs), +namely VGG19 and Inception v3. The experiment is conducted on the Tomato +Villages Dataset, encompassing images of both healthy tomato leaves and leaves +afflicted by various diseases. The VGG19 model is augmented with fully +connected layers, while the Inception v3 model is modified to incorporate a +global average pooling layer and a dense classification layer. Both models are +trained on the prepared dataset, and their performances are evaluated on a +separate test set. This research employs VGG19 and Inception v3 models on the +Tomato Villages dataset (4525 images) for tomato leaf disease detection. The +models' accuracy of 93.93% with dropout layers demonstrates their usefulness +for crop health monitoring. The paper suggests a deep learning-based strategy +that includes normalization, resizing, dataset preparation, and unique model +architectures. During training, VGG19 and Inception v3 serve as feature +extractors, with possible data augmentation and fine-tuning. Metrics like +accuracy, precision, recall, and F1 score are obtained through evaluation on a +test set and offer important insights into the strengths and shortcomings of +the model. The method has the potential for practical use in precision +agriculture and could help tomato crops prevent illness early on. + +
+
+ comment: 10 pages, 6 figures, ROC curves, confusion matrix analysis, and + classification reports +
+
+
+
+
+ + ☆ Adaptive Class Learning to Screen Diabetic Disorders in Fundus Images of + Eye + + +
+ The prevalence of ocular illnesses is growing globally, presenting a
+substantial public health challenge. Early detection and timely intervention
+are crucial for averting visual impairment and enhancing patient prognosis.
+This research introduces a new framework called Class Extension with Limited
+Data (CELD) to train a classifier to categorize retinal fundus images. The
+classifier is initially trained to identify relevant features concerning
+Healthy and Diabetic Retinopathy (DR) classes and later fine-tuned to adapt to
+the task of classifying the input images into three classes: Healthy, DR, and
+Glaucoma. This strategy allows the model to gradually enhance its
+classification capabilities, which is beneficial in situations where there are
+only a limited number of labeled datasets available. Perturbation methods are
+also used to identify the input image characteristics responsible for
+influencing the model's decision-making process. We achieve an overall accuracy
+of 91% on publicly available datasets.
+
+ comment: Accepted at International Conference on Pattern Recognition (ICPR) + 2024 +
+
+
+
+
+ + ☆ Advancing Earth Observation: A Survey on AI-Powered Image Processing in + Satellites + + +
+ Advancements in technology and reductions in its cost have led to a
+substantial growth in the quality and quantity of imagery captured by Earth
+Observation (EO) satellites. This has presented a challenge to the efficacy of
+the traditional workflow of transmitting this imagery to Earth for processing.
+An approach to addressing this issue is to use pre-trained artificial
+intelligence models to process images on-board the satellite, but this is
+difficult given the constraints within a satellite's environment. This paper
+provides an up-to-date and thorough review of research related to image
+processing on-board Earth observation satellites. The significant constraints
+are detailed along with the latest strategies to mitigate them.
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Comparative Analysis of Pre-trained Deep Learning Models and DINOv2 for + Cushing's Syndrome Diagnosis in Facial Analysis + + +
+ Cushing's syndrome is a condition caused by excessive glucocorticoid +secretion from the adrenal cortex, often manifesting with moon facies and +plethora, making facial data crucial for diagnosis. Previous studies have used +pre-trained convolutional neural networks (CNNs) for diagnosing Cushing's +syndrome using frontal facial images. However, CNNs are better at capturing +local features, while Cushing's syndrome often presents with global facial +features. Transformer-based models like ViT and SWIN, which utilize +self-attention mechanisms, can better capture long-range dependencies and +global features. Recently, DINOv2, a foundation model based on visual +Transformers, has gained interest. This study compares the performance of +various pre-trained models, including CNNs, Transformer-based models, and +DINOv2, in diagnosing Cushing's syndrome. We also analyze gender bias and the +impact of freezing mechanisms on DINOv2. Our results show that +Transformer-based models and DINOv2 outperformed CNNs, with ViT achieving the +highest F1 score of 85.74%. Both the pre-trained model and DINOv2 had higher +accuracy for female samples. DINOv2 also showed improved performance when +freezing parameters. In conclusion, Transformer-based models and DINOv2 are +effective for Cushing's syndrome classification. + +
+
+
+
+
+ + ☆ Foreign object segmentation in chest x-rays through anatomy-guided shape + insertion + + +
+ In this paper, we tackle the challenge of instance segmentation for foreign
+objects in chest radiographs, commonly seen in postoperative follow-ups with
+stents, pacemakers, or ingested objects in children. The diversity of foreign
+objects complicates dense annotation, as reflected by the insufficiency of
+existing datasets. To address this, we propose the simple generation of
+synthetic data through (1) insertion of arbitrary shapes (lines, polygons,
+ellipses) with varying contrasts and opacities, and (2) cut-paste augmentations
+from a small set of semi-automatically extracted labels. These insertions are
+guided by anatomy labels to ensure realistic placements, such as stents
+appearing only in relevant vessels. Our approach enables networks to segment
+complex structures with minimal manually labeled data. Notably, it achieves
+performance comparable to fully supervised models while using 93% fewer manual
+annotations.
+
+
+
+
+ + ☆ On the "Illusion" of Gender Bias in Face Recognition: Explaining the + Fairness Issue Through Non-demographic Attributes + + +
+ Face recognition systems (FRS) exhibit significant accuracy differences based +on the user's gender. Since such a gender gap reduces the trustworthiness of +FRS, more recent efforts have tried to find the causes. However, these studies +make use of manually selected, correlated, and small-sized sets of facial +features to support their claims. In this work, we analyse gender bias in face +recognition by successfully extending the search domain to decorrelated +combinations of 40 non-demographic facial characteristics. First, we propose a +toolchain to effectively decorrelate and aggregate facial attributes to enable +a less-biased gender analysis on large-scale data. Second, we introduce two new +fairness metrics to measure fairness with and without context. Based on these +grounds, we thirdly present a novel unsupervised algorithm able to reliably +identify attribute combinations that lead to vanishing bias when used as filter +predicates for balanced testing datasets. The experiments show that the gender +gap vanishes when images of male and female subjects share specific attributes, +clearly indicating that the issue is not a question of biology but of the +social definition of appearance. These findings could reshape our understanding +of fairness in face biometrics and provide insights into FRS, helping to +address gender bias issues. + +
+
+
+
+
+ + ☆ Are Traditional Deep Learning Model Approaches as Effective as a + Retinal-Specific Foundation Model for Ocular and Systemic Disease Detection? + + +
+ Background: RETFound, a self-supervised, retina-specific foundation model
+(FM), showed potential in downstream applications. However, its comparative
+performance with traditional deep learning (DL) models remains incompletely
+understood. This study aimed to evaluate RETFound against three
+ImageNet-pretrained supervised DL models (ResNet50, ViT-base, SwinV2) in
+detecting ocular and systemic diseases.
+ Methods: We fine-tuned/trained RETFound and the three DL models on the full
+datasets, 50% and 20% subsets, and fixed sample sizes (400, 200, and 100
+images, with half comprising disease cases; for each DR severity class, 100 and
+50 cases were used). Fine-tuned models were tested internally using the SEED
+(53,090 images) and APTOS-2019 (3,672 images) datasets and externally validated
+on population-based (BES, CIEMS, SP2, UKBB) and open-source datasets (ODIR-5k,
+PAPILA, GAMMA, IDRiD, MESSIDOR-2). Model performance was compared using the
+area under the receiver operating characteristic curve (AUC) and Z-tests with
+Bonferroni correction (P<0.05/3).
+ Interpretation: Traditional DL models are mostly comparable to RETFound for
+ocular disease detection with large datasets. However, RETFound is superior in
+systemic disease detection with smaller datasets. These findings offer valuable
+insights into the respective merits and limitations of traditional models and
+FMs.
+
+
+
+
+ + ☆ Survey on Hand Gesture Recognition from Visual Input + + +
+ Hand gesture recognition has become an important research area, driven by the +growing demand for human-computer interaction in fields such as sign language +recognition, virtual and augmented reality, and robotics. Despite the rapid +growth of the field, there are few surveys that comprehensively cover recent +research developments, available solutions, and benchmark datasets. This survey +addresses this gap by examining the latest advancements in hand gesture and 3D +hand pose recognition from various types of camera input data including RGB +images, depth images, and videos from monocular or multiview cameras, examining +the differing methodological requirements of each approach. Furthermore, an +overview of widely used datasets is provided, detailing their main +characteristics and application domains. Finally, open challenges such as +achieving robust recognition in real-world environments, handling occlusions, +ensuring generalization across diverse users, and addressing computational +efficiency for real-time applications are highlighted to guide future research +directions. By synthesizing the objectives, methodologies, and applications of +recent studies, this survey offers valuable insights into current trends, +challenges, and opportunities for future research in human hand gesture +recognition. + +
+
+
+
+
+ + ☆ SMamba: Sparse Mamba for Event-based Object Detection + + +
+ Transformer-based methods have achieved remarkable performance in event-based +object detection, owing to the global modeling ability. However, they neglect +the influence of non-event and noisy regions and process them uniformly, +leading to high computational overhead. To mitigate computation cost, some +researchers propose window attention based sparsification strategies to discard +unimportant regions, which sacrifices the global modeling ability and results +in suboptimal performance. To achieve better trade-off between accuracy and +efficiency, we propose Sparse Mamba (SMamba), which performs adaptive +sparsification to reduce computational effort while maintaining global modeling +capability. Specifically, a Spatio-Temporal Continuity Assessment module is +proposed to measure the information content of tokens and discard uninformative +ones by leveraging the spatiotemporal distribution differences between activity +and noise events. Based on the assessment results, an Information-Prioritized +Local Scan strategy is designed to shorten the scan distance between +high-information tokens, facilitating interactions among them in the spatial +dimension. Furthermore, to extend the global interaction from 2D space to 3D +representations, a Global Channel Interaction module is proposed to aggregate +channel information from a global spatial perspective. Results on three +datasets (Gen1, 1Mpx, and eTram) demonstrate that our model outperforms other +methods in both performance and efficiency. + +
+
+ comment: AAAI2025 +
+
+
+
+
+ + ☆ A Lightweight and Interpretable Deepfakes Detection Framework + + +
+ The recent realistic creation and dissemination of so-called deepfakes poses
+a serious threat to social life, civil rest, and law. Celebrity defaming,
+election manipulation, and deepfakes as evidence in court of law are a few
+potential consequences of deepfakes. The availability of open-source trained
+models based on modern frameworks such as PyTorch or TensorFlow, video
+manipulation apps such as FaceApp and REFACE, and economical computing
+infrastructure has eased the creation of deepfakes. Most of the existing
+detectors focus on detecting either face-swap, lip-sync, or puppet master
+deepfakes, but a unified framework to detect all three types of deepfakes is
+hardly explored. This paper presents a unified framework that exploits the
+power of proposed feature fusion of hybrid facial landmarks and our novel heart
+rate features for detection of all types of deepfakes. We propose novel heart
+rate features and fuse them with the facial landmark features to better
+extract the facial artifacts of fake videos and natural variations available in
+the original videos. We used these features to train a lightweight XGBoost to
+classify between the deepfake and bonafide videos. We evaluated the performance
+of our framework on the world leaders dataset (WLDR) that contains all types of
+deepfakes. Experimental results illustrate that the proposed framework offers
+superior detection performance over the comparative deepfakes detection
+methods. Performance comparison of our framework against the LSTM-FCN, a
+candidate deep learning model, shows that the proposed model achieves similar
+results while being more interpretable.
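The sketch below illustrates the general shape of the described pipeline: concatenating landmark and heart-rate feature vectors and fitting an XGBoost classifier. All arrays are random placeholders and the hyperparameters are assumptions, not the WLDR setup or the authors' feature extractors.

```python
# Hedged sketch: fuse facial-landmark and heart-rate features, then train a
# lightweight XGBoost classifier, mirroring the pipeline described above.
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
landmark_feats = rng.normal(size=(500, 136))    # e.g. 68 (x, y) landmark coords
heart_rate_feats = rng.normal(size=(500, 16))   # e.g. per-video rPPG statistics
labels = rng.integers(0, 2, size=500)           # 0 = bonafide, 1 = deepfake

features = np.hstack([landmark_feats, heart_rate_feats])   # feature fusion
x_tr, x_te, y_tr, y_te = train_test_split(features, labels,
                                          test_size=0.2, random_state=0)

clf = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1,
                    eval_metric="logloss")
clf.fit(x_tr, y_tr)
print("accuracy:", clf.score(x_te, y_te))
```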
+
+
+
+
+ + ☆ Progressive Cross Attention Network for Flood Segmentation using + Multispectral Satellite Imagery + + +
+ In recent years, the integration of deep learning techniques with remote
+sensing technology has revolutionized the way natural hazards, such as floods,
+are monitored and managed. However, existing methods for flood segmentation
+using remote sensing data often overlook the utility of correlative features
+among multispectral satellite information. In this study, we introduce a
+progressive cross attention network (ProCANet), a deep learning model that
+progressively applies both self- and cross-attention mechanisms to
+multispectral features, generating optimal feature combinations for flood
+segmentation. The proposed model was compared with state-of-the-art approaches
+using the Sen1Floods11 dataset and our bespoke flood data generated for the
+Citarum River basin, Indonesia. Our model demonstrated superior performance
+with the highest Intersection over Union (IoU) score of 0.815. Our results in
+this study, coupled with the ablation assessment comparing scenarios with and
+without attention across various modalities, open a promising path for
+enhancing the accuracy of flood analysis using remote sensing technology.
+
+ comment: 5 pages, 4 figures, published in IEEE Geoscience and Remote Sensing + Letters +
+
+
+
+
+ + ☆ Enhancing Adversarial Transferability via Component-Wise Augmentation + Method + + +
+ Deep Neural Networks (DNNs) are highly vulnerable to adversarial examples, +which pose significant challenges in security-sensitive applications. Among +various adversarial attack strategies, input transformation-based attacks have +demonstrated remarkable effectiveness in enhancing adversarial transferability. +However, existing methods fail to diversify attention regions across models +adequately and introduce excessive information loss during transformations. In +this paper, we introduce a novel input transformation-based method, termed +Component-Wise Augmentation (CWA), designed to enhance transferability by +locally applying block-wise transformations. CWA strategically integrates +interpolation and selective rotation on individual image blocks to diversify +model attention regions while preserving semantic integrity. Extensive +experiments on the standard ImageNet dataset show that CWA consistently +outperforms state-of-the-art methods in both attack success rates and stability +across CNN- and Transformer-based models, while also demonstrating superior +performance against multiple defense methods. + +
+
+ comment: 13 pages, 5 figures
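For intuition about the block-wise input transformations used by CWA (described two entries above), here is a minimal sketch that splits an image into a grid and independently rotates randomly chosen blocks. The exact CWA interpolation and rotation operations are not reproduced, so treat this purely as an illustration of the idea.

```python
# Hedged sketch of a block-wise input transformation in the spirit of CWA:
# each grid block is independently rotated by a multiple of 90 degrees with
# probability 0.5 before reassembly. Simplified relative to the paper.
import numpy as np

def blockwise_augment(img, grid=4, rng=None):
    rng = rng or np.random.default_rng()
    h, w, _ = img.shape
    bh, bw = h // grid, w // grid
    out = img.copy()
    for i in range(grid):
        for j in range(grid):
            if rng.random() < 0.5:
                block = out[i*bh:(i+1)*bh, j*bw:(j+1)*bw]
                k = int(rng.integers(1, 4))
                if bh != bw and k % 2 == 1:
                    k = 2                # keep the block shape when not square
                out[i*bh:(i+1)*bh, j*bw:(j+1)*bw] = np.rot90(block, k=k)
    return out

image = np.random.rand(224, 224, 3)      # placeholder input image
augmented = blockwise_augment(image)
print(augmented.shape)
```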
+
+
+
+
+ + ☆ LASER: Lip Landmark Assisted Speaker Detection for Robustness + + +
+ Active Speaker Detection (ASD) aims to identify speaking individuals in
+complex visual scenes. While humans can easily detect speech by matching lip
+movements to audio, current ASD models struggle to establish this
+correspondence, often misclassifying non-speaking instances when audio and lip
+movements are unsynchronized. To address this limitation, we propose Lip
+landmark Assisted Speaker dEtection for Robustness (LASER). Unlike models that
+rely solely on facial frames, LASER explicitly focuses on lip movements by
+integrating lip landmarks in training. Specifically, given a face track, LASER
+extracts frame-level visual features and the 2D coordinates of lip landmarks
+using a lightweight detector. These coordinates are encoded into dense feature
+maps, providing spatial and structural information on lip positions.
+Recognizing that landmark detectors may sometimes fail under challenging
+conditions (e.g., low resolution, occlusions, extreme angles), we incorporate
+an auxiliary consistency loss to align predictions from both lip-aware and
+face-only features, ensuring reliable performance even when lip data is absent.
+Extensive experiments across multiple datasets show that LASER outperforms
+state-of-the-art models, especially in scenarios with desynchronized audio and
+visuals, demonstrating robust performance in real-world video contexts. Code is
+available at https://github.com/plnguyen2908/LASER_ASD.
+
+
+
+
+ + ☆ Contrastive Masked Autoencoders for Character-Level Open-Set Writer + Identification + + +
+ In the realm of digital forensics and document authentication, writer +identification plays a crucial role in determining the authors of documents +based on handwriting styles. The primary challenge in writer-id is the +"open-set scenario", where the goal is accurately recognizing writers unseen +during the model training. To overcome this challenge, representation learning +is the key. This method can capture unique handwriting features, enabling it to +recognize styles not previously encountered during training. Building on this +concept, this paper introduces the Contrastive Masked Auto-Encoders (CMAE) for +Character-level Open-Set Writer Identification. We merge Masked Auto-Encoders +(MAE) with Contrastive Learning (CL) to simultaneously and respectively capture +sequential information and distinguish diverse handwriting styles. +Demonstrating its effectiveness, our model achieves state-of-the-art (SOTA) +results on the CASIA online handwriting dataset, reaching an impressive +precision rate of 89.7%. Our study advances universal writer-id with a +sophisticated representation learning approach, contributing substantially to +the ever-evolving landscape of digital handwriting analysis, and catering to +the demands of an increasingly interconnected world. + +
+
+
+
+
+ + ☆ Fast Underwater Scene Reconstruction using Multi-View Stereo and + Physical Imaging + + +
+ Underwater scene reconstruction poses a substantial challenge because of the +intricate interplay between light and the medium, resulting in scattering and +absorption effects that make both depth estimation and rendering more complex. +While recent Neural Radiance Fields (NeRF) based methods for underwater scenes +achieve high-quality results by modeling and separating the scattering medium, +they still suffer from slow training and rendering speeds. To address these +limitations, we propose a novel method that integrates Multi-View Stereo (MVS) +with a physics-based underwater image formation model. Our approach consists of +two branches: one for depth estimation using the traditional cost volume +pipeline of MVS, and the other for rendering based on the physics-based image +formation model. The depth branch improves scene geometry, while the medium +branch determines the scattering parameters to achieve precise scene rendering. +Unlike traditional MVSNet methods that rely on ground-truth depth, our method +does not necessitate the use of depth truth, thus allowing for expedited +training and rendering processes. By leveraging the medium subnet to estimate +the medium parameters and combining this with a color MLP for rendering, we +restore the true colors of underwater scenes and achieve higher-fidelity +geometric representations. Experimental results show that our method enables +high-quality synthesis of novel views in scattering media, clear views +restoration by removing the medium, and outperforms existing methods in +rendering quality and training efficiency. + +
+
+
+
+
+
 ☆ FNIN: A Fourier Neural Operator-based Numerical Integration Network for
 Surface-from-gradients
+ Surface-from-gradients (SfG) aims to recover a three-dimensional (3D) surface +from its gradients. Traditional methods encounter significant challenges in +achieving high accuracy and handling high-resolution inputs, particularly +facing the complex nature of discontinuities and the inefficiencies associated +with large-scale linear solvers. Although recent advances in deep learning, +such as photometric stereo, have enhanced normal estimation accuracy, they do +not fully address the intricacies of gradient-based surface reconstruction. To +overcome these limitations, we propose a Fourier neural operator-based +Numerical Integration Network (FNIN) within a two-stage optimization framework. +In the first stage, our approach employs an iterative architecture for +numerical integration, harnessing an advanced Fourier neural operator to +approximate the solution operator in Fourier space. Additionally, a +self-learning attention mechanism is incorporated to effectively detect and +handle discontinuities. In the second stage, we refine the surface +reconstruction by formulating a weighted least squares problem, addressing the +identified discontinuities rationally. Extensive experiments demonstrate that +our method achieves significant improvements in both accuracy and efficiency +compared to current state-of-the-art solvers. This is particularly evident in +handling high-resolution images with complex data, achieving errors of fewer +than 0.1 mm on tested objects. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents + + +
+ Multimodal Large Language Models (MLLMs) have shown significant advancements, +providing a promising future for embodied agents. Existing benchmarks for +evaluating MLLMs primarily utilize static images or videos, limiting +assessments to non-interactive scenarios. Meanwhile, existing embodied AI +benchmarks are task-specific and not diverse enough, which do not adequately +evaluate the embodied capabilities of MLLMs. To address this, we propose +EmbodiedEval, a comprehensive and interactive evaluation benchmark for MLLMs +with embodied tasks. EmbodiedEval features 328 distinct tasks within 125 varied +3D scenes, each of which is rigorously selected and annotated. It covers a +broad spectrum of existing embodied AI tasks with significantly enhanced +diversity, all within a unified simulation and evaluation framework tailored +for MLLMs. The tasks are organized into five categories: navigation, object +interaction, social interaction, attribute question answering, and spatial +question answering to assess different capabilities of the agents. We evaluated +the state-of-the-art MLLMs on EmbodiedEval and found that they have a +significant shortfall compared to human level on embodied tasks. Our analysis +demonstrates the limitations of existing MLLMs in embodied capabilities, +providing insights for their future development. We open-source all evaluation +data and simulation framework at https://github.com/thunlp/EmbodiedEval. + +
+
+
+
+
+ + ☆ WaveNet-SF: A Hybrid Network for Retinal Disease Detection Based on + Wavelet Transform in the Spatial-Frequency Domain + + +
+ Retinal diseases are a leading cause of vision impairment and blindness, with
+timely diagnosis being critical for effective treatment. Optical Coherence
+Tomography (OCT) has become a standard imaging modality for retinal disease
+diagnosis, but OCT images often suffer from issues such as speckle noise,
+complex lesion shapes, and varying lesion sizes, making interpretation
+challenging. In this paper, we propose a novel framework, WaveNet-SF, to
+enhance retinal disease detection by integrating spatial-domain and
+frequency-domain learning. The framework utilizes wavelet transforms to
+decompose OCT images into low- and high-frequency components, enabling the
+model to extract both global structural features and fine-grained details. To
+improve lesion detection, we introduce a multi-scale wavelet spatial attention
+(MSW-SA) module, which enhances the model's focus on regions of interest at
+multiple scales. Additionally, a high-frequency feature compensation block
+(HFFC) is incorporated to recover edge information lost during wavelet
+decomposition, suppress noise, and preserve fine details crucial for lesion
+detection. Our approach achieves state-of-the-art (SOTA) classification
+accuracies of 97.82% and 99.58% on the OCT-C8 and OCT2017 datasets,
+respectively, surpassing existing methods. These results demonstrate the
+efficacy of WaveNet-SF in addressing the challenges of OCT image analysis and
+its potential as a powerful tool for retinal disease diagnosis.
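The spatial/frequency split that WaveNet-SF builds on can be prototyped with a single-level 2D wavelet transform, as in the sketch below. The random array stands in for an OCT B-scan, and the paper's attention and compensation modules are not included.

```python
# Hedged sketch: one level of 2D wavelet decomposition, the kind of
# low/high-frequency split described above. Requires the PyWavelets package.
import numpy as np
import pywt

oct_image = np.random.rand(256, 256)                 # placeholder OCT slice

# Low-frequency approximation (cA) and high-frequency detail bands
# (horizontal, vertical, diagonal) from a single-level Haar DWT.
cA, (cH, cV, cD) = pywt.dwt2(oct_image, "haar")
print(cA.shape, cH.shape, cV.shape, cD.shape)        # each (128, 128)

# In a WaveNet-SF-style model, cA would feed a global-structure branch and
# the detail bands a fine-detail/edge branch; here we just verify the
# decomposition is invertible.
reconstructed = pywt.idwt2((cA, (cH, cV, cD)), "haar")
print(np.allclose(reconstructed, oct_image))
```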
+
+
+
+
+ + ☆ Survey on Monocular Metric Depth Estimation + + +
+ Monocular Depth Estimation (MDE) is a fundamental computer vision task +underpinning applications such as spatial understanding, 3D reconstruction, and +autonomous driving. While deep learning-based MDE methods can predict relative +depth from a single image, their lack of metric scale information often results +in scale inconsistencies, limiting their utility in downstream tasks like +visual SLAM, 3D reconstruction, and novel view synthesis. Monocular Metric +Depth Estimation (MMDE) addresses these challenges by enabling precise, +scene-scale depth inference. MMDE improves depth consistency, enhances +sequential task stability, simplifies integration into downstream applications, +and broadens practical use cases. This paper provides a comprehensive review of +depth estimation technologies, highlighting the evolution from geometry-based +methods to state-of-the-art deep learning approaches. It emphasizes +advancements in scale-agnostic methods, which are crucial for enabling +zero-shot generalization as the foundational capability for MMDE. Recent +progress in zero-shot MMDE research is explored, focusing on challenges such as +model generalization and the loss of detail at scene boundaries. Innovative +strategies to address these issues include unlabelled data augmentation, image +patching, architectural optimization, and generative techniques. These +advancements, analyzed in detail, demonstrate significant contributions to +overcoming existing limitations. Finally, this paper synthesizes recent +developments in zero-shot MMDE, identifies unresolved challenges, and outlines +future research directions. By offering a clear roadmap and cutting-edge +insights, this work aims to deepen understanding of MMDE, inspire novel +applications, and drive technological innovation. + +
+
+
+
+
+ + ☆ Data-driven Detection and Evaluation of Damages in Concrete Structures: + Using Deep Learning and Computer Vision + + +
+ Structural integrity is vital for maintaining the safety and longevity of
+concrete infrastructures such as bridges, tunnels, and walls. Traditional
+methods for detecting damages like cracks and spalls are labor-intensive,
+time-consuming, and prone to human error. To address these challenges, this
+study explores advanced data-driven techniques using deep learning for
+automated damage detection and analysis. Two state-of-the-art instance
+segmentation models, YOLO-v7 instance segmentation and Mask R-CNN, were
+evaluated using a dataset comprising 400 images, augmented to 10,995 images
+through geometric and color-based transformations to enhance robustness. The
+models were trained and validated using a 90%/10% split between the training
+set and the combined validation and test set. Performance metrics such as
+precision, recall, mean average precision (mAP@0.5), and frames per second
+(FPS) were used for evaluation. YOLO-v7 achieved a superior mAP@0.5 of 96.1%
+and processed 40 FPS, outperforming Mask R-CNN, which achieved a mAP@0.5 of
+92.1% with a slower processing speed of 18 FPS. The findings recommend the
+YOLO-v7 instance segmentation model for real-time, high-speed structural health
+monitoring, while Mask R-CNN is better suited for detailed offline assessments.
+This study demonstrates the potential of deep learning to revolutionize
+infrastructure maintenance, offering a scalable and efficient solution for
+automated damage detection.
+
+ comment: 17 pages, 10 figures. This study focuses on the data-driven detection + and evaluation of damages in concrete structures using deep learning and + computer vision techniques +
+
+
+
+
+ + ☆ CogMorph: Cognitive Morphing Attacks for Text-to-Image Models + + +
+ The development of text-to-image (T2I) generative models, which enable the
+creation of high-quality synthetic images from textual prompts, has opened new
+frontiers in creative design and content generation. However, this paper
+reveals a significant and previously unrecognized ethical risk inherent in this
+technology and introduces a novel method, termed the Cognitive Morphing Attack
+(CogMorph), which manipulates T2I models to generate images that retain the
+original core subjects but embed toxic or harmful contextual elements. This
+nuanced manipulation exploits the cognitive principle that human perception of
+concepts is shaped by the entire visual scene and its context, producing images
+that amplify emotional harm far beyond attacks that merely preserve the
+original semantics. To address this, we first construct an imagery toxicity
+taxonomy spanning 10 major and 48 sub-categories, aligned with human
+cognitive-perceptual dimensions, and further build a toxicity risk matrix
+resulting in 1,176 high-quality T2I toxic prompts. Based on this, our CogMorph
+first introduces Cognitive Toxicity Augmentation, which develops a cognitive
+toxicity knowledge base with rich external toxic representations for humans
+(e.g., fine-grained visual features) that can be utilized to further guide the
+optimization of adversarial prompts. In addition, we present Contextual
+Hierarchical Morphing, which hierarchically extracts critical parts of the
+original prompt (e.g., scenes, subjects, and body parts), and then iteratively
+retrieves and fuses toxic features to inject harmful contexts. Extensive
+experiments on multiple open-sourced T2I models and black-box commercial APIs
+(e.g., DALLE-3) demonstrate the efficacy of CogMorph, which significantly
+outperforms other baselines by large margins (+20.62% on average).
+
+
+
+
+ + ☆ TFLOP: Table Structure Recognition Framework with Layout Pointer + Mechanism + + +
+ Table Structure Recognition (TSR) is a task aimed at converting table images +into a machine-readable format (e.g. HTML), to facilitate other applications +such as information retrieval. Recent works tackle this problem by identifying +the HTML tags and text regions, where the latter is used for text extraction +from the table document. These works however, suffer from misalignment issues +when mapping text into the identified text regions. In this paper, we introduce +a new TSR framework, called TFLOP (TSR Framework with LayOut Pointer +mechanism), which reformulates the conventional text region prediction and +matching into a direct text region pointing problem. Specifically, TFLOP +utilizes text region information to identify both the table's structure tags +and its aligned text regions, simultaneously. Without the need for region +prediction and alignment, TFLOP circumvents the additional text region matching +stage, which requires finely-calibrated post-processing. TFLOP also employs +span-aware contrastive supervision to enhance the pointing mechanism in tables +with complex structure. As a result, TFLOP achieves the state-of-the-art +performance across multiple benchmarks such as PubTabNet, FinTabNet, and +SynthTabNet. In our extensive experiments, TFLOP not only exhibits competitive +performance but also shows promising results on industrial document TSR +scenarios such as documents with watermarks or in non-English domain. + +
+
+ comment: Published in IJCAI Proceedings 2024 +
+
+
+
+
+ + ☆ Provably effective detection of effective data poisoning attacks + + +
+ This paper establishes a mathematically precise definition of dataset +poisoning attack and proves that the very act of effectively poisoning a +dataset ensures that the attack can be effectively detected. On top of a +mathematical guarantee that dataset poisoning is identifiable by a new +statistical test that we call the Conformal Separability Test, we provide +experimental evidence that we can adequately detect poisoning attempts in the +real world. + +
+
+
+
+
+ + ☆ How Does the Spatial Distribution of Pre-training Data Affect Geospatial + Foundation Models? + + +
+ Foundation models have made rapid advances in many domains including Earth +observation, where Geospatial Foundation Models (GFMs) can help address global +challenges such as climate change, agriculture, and disaster response. Previous +work on GFMs focused on tailoring model architecture and pre-text tasks, and +did not investigate the impact of pre-training data selection on model +performance. However, recent works from other domains show that the +pre-training data distribution is an important factor influencing the +performance of the foundation models. With this motivation, our research +explores how the geographic distribution of pre-training data affects the +performance of GFMs. We evaluated several pre-training data distributions by +sampling different compositions from a global data pool. Our experiments with +two GFMs on downstream tasks indicate that balanced and globally representative +data compositions often outperform region-specific sampling, highlighting the +importance of diversity and global coverage in pre-training data. Our results +suggest that the most appropriate data sampling technique may depend on the +specific GFM architecture. These findings will support the development of +robust GFMs by incorporating quality pre-training data distributions, +ultimately improving machine learning solutions for Earth observation. + +
+
+ comment: Accepted at Good Data for Generative AI @ AAAI 2025 +
+
+
+
+
+ + ☆ Efficient Lung Ultrasound Severity Scoring Using Dedicated Feature + Extractor + + +
+ With the advent of the COVID-19 pandemic, ultrasound imaging has emerged as a
+promising technique for COVID-19 detection, due to its non-invasive nature,
+affordability, and portability. In response, researchers have focused on
+developing AI-based scoring systems to provide real-time diagnostic support.
+However, the limited size and lack of proper annotation in publicly available
+ultrasound datasets pose significant challenges for training a robust AI model.
+This paper proposes MeDiVLAD, a novel pipeline to address the above issue for
+multi-level lung-ultrasound (LUS) severity scoring. In particular, we leverage
+self-knowledge distillation to pretrain a vision transformer (ViT) without
+labels and aggregate frame-level features via dual-level VLAD aggregation. We
+show that with minimal fine-tuning, MeDiVLAD outperforms conventional
+fully-supervised methods in both frame- and video-level scoring, while offering
+classification reasoning with exceptional quality. This superior performance
+enables key applications such as the automatic identification of critical lung
+pathology areas and provides a robust solution for broader medical video
+classification tasks.
+
+ comment: Accepted by IEEE ISBI 2025 +
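As a rough illustration of the VLAD-style aggregation referenced in the MeDiVLAD entry above, the sketch below pools random stand-in frame features into a single video descriptor. The cluster count and feature dimensions are assumptions, and the dual-level scheme and ViT backbone are omitted.

```python
# Hedged sketch of VLAD aggregation over frame-level features; a simplified,
# single-level stand-in for the pooling used in MeDiVLAD.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
frame_feats = rng.normal(size=(120, 192))      # (num_frames, feat_dim)

def vlad(features, centers):
    # Hard-assign each frame feature to its nearest cluster center.
    assign = np.argmin(
        ((features[:, None, :] - centers[None, :, :]) ** 2).sum(-1), axis=1)
    v = np.zeros_like(centers)
    for k in range(len(centers)):
        members = features[assign == k]
        if len(members):
            v[k] = (members - centers[k]).sum(axis=0)   # residual sum
    v = np.sign(v) * np.sqrt(np.abs(v))                 # power normalization
    return (v / (np.linalg.norm(v) + 1e-12)).ravel()    # L2-normalized VLAD

centers = KMeans(n_clusters=8, n_init=10,
                 random_state=0).fit(frame_feats).cluster_centers_
video_descriptor = vlad(frame_feats, centers)
print(video_descriptor.shape)                           # (8 * 192,)
```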
+
+
+
+
+ + ☆ Large-image Object Detection for Fine-grained Recognition of Punches + Patterns in Medieval Panel Painting + + +
+ The attribution of the author of an art piece is typically a laborious manual +process, usually relying on subjective evaluations of expert figures. However, +there are some situations in which quantitative features of the artwork can +support these evaluations. The extraction of these features can sometimes be +automated, for instance, with the use of Machine Learning (ML) techniques. An +example of these features is represented by repeated, mechanically impressed +patterns, called punches, present chiefly in 13th and 14th-century panel +paintings from Tuscany. Previous research in art history showcased a strong +connection between the shapes of punches and specific artists or workshops, +suggesting the possibility of using these quantitative cues to support the +attribution. In the present work, we first collect a dataset of large-scale +images of these panel paintings. Then, using YOLOv10, a recent and popular +object detection model, we train a ML pipeline to perform object detection on +the punches contained in the images. Due to the large size of the images, the +detection procedure is split across multiple frames by adopting a +sliding-window approach with overlaps, after which the predictions are combined +for the whole image using a custom non-maximal suppression routine. Our results +indicate how art historians working in the field can reliably use our method +for the identification and extraction of punches. + +
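The sliding-window detection scheme described above can be sketched as follows: tile the large image with overlap, detect per tile, shift boxes back to global coordinates, and merge duplicates with NMS. The per-tile detector here is a placeholder, not the trained YOLOv10 model, and torchvision's standard NMS stands in for the paper's custom suppression routine.

```python
# Hedged sketch of sliding-window detection on a very large image with
# overlapping tiles and NMS to merge duplicate hits in the overlaps.
import torch
from torchvision.ops import nms

def detect_tile(tile):
    """Placeholder detector: returns (boxes [N, 4] in tile coords, scores [N])."""
    return torch.tensor([[10.0, 10.0, 50.0, 50.0]]), torch.tensor([0.9])

def detect_large_image(image_hw, tile=640, overlap=128, iou_thr=0.5):
    h, w = image_hw
    step = tile - overlap
    all_boxes, all_scores = [], []
    for y in range(0, max(h - overlap, 1), step):
        for x in range(0, max(w - overlap, 1), step):
            boxes, scores = detect_tile(None)   # would crop image[y:y+tile, x:x+tile]
            # Shift tile-local boxes into global image coordinates.
            boxes = boxes + torch.tensor([x, y, x, y], dtype=torch.float32)
            all_boxes.append(boxes)
            all_scores.append(scores)
    boxes = torch.cat(all_boxes)
    scores = torch.cat(all_scores)
    keep = nms(boxes, scores, iou_thr)          # suppress duplicates from overlaps
    return boxes[keep], scores[keep]

boxes, scores = detect_large_image((4000, 6000))
print(boxes.shape, scores.shape)
```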
+
+
+
+
+ + ☆ Bidirectional Brain Image Translation using Transfer Learning from + Generic Pre-trained Models + + +
+ Brain imaging plays a crucial role in the diagnosis and treatment of various +neurological disorders, providing valuable insights into the structure and +function of the brain. Techniques such as magnetic resonance imaging (MRI) and +computed tomography (CT) enable non-invasive visualization of the brain, aiding +in the understanding of brain anatomy, abnormalities, and functional +connectivity. However, cost and radiation dose may limit the acquisition of +specific image modalities, so medical image synthesis can be used to generate +required medical images without actual addition. In the medical domain, where +obtaining labeled medical images is labor-intensive and expensive, addressing +data scarcity is a major challenge. Recent studies propose using transfer +learning to overcome this issue. This involves adapting pre-trained CycleGAN +models, initially trained on non-medical data, to generate realistic medical +images. In this work, transfer learning was applied to the task of MR-CT image +translation and vice versa using 18 pre-trained non-medical models, and the +models were fine-tuned to have the best result. The models' performance was +evaluated using four widely used image quality metrics: +Peak-signal-to-noise-ratio, Structural Similarity Index, Universal Quality +Index, and Visual Information Fidelity. Quantitative evaluation and qualitative +perceptual analysis by radiologists demonstrate the potential of transfer +learning in medical imaging and the effectiveness of the generic pre-trained +model. The results provide compelling evidence of the model's exceptional +performance, which can be attributed to the high quality and similarity of the +training images to actual human brain images. These results underscore the +significance of carefully selecting appropriate and representative training +images to optimize performance in brain image analysis tasks. + +
+
+ comment: 19 pages, 9 figures, 6 tables +
+
+
+
+
+ + ☆ fabSAM: A Farmland Boundary Delineation Method Based on the Segment + Anything Model + + +
+ Delineating farmland boundaries is essential for agricultural management such
+as crop monitoring and agricultural census. Traditional methods using remote
+sensing imagery have been efficient but limited in generalisation. The Segment
+Anything Model (SAM), known for its impressive zero-shot performance, has been
+adapted for remote sensing tasks through prompt learning and fine-tuning. Here,
+we propose a SAM-based farmland boundary delineation framework 'fabSAM' that
+combines a Deeplabv3+-based Prompter and SAM. Also, a fine-tuning strategy was
+introduced to enable SAM's decoder to improve the use of prompt information.
+Experimental results on the AI4Boundaries and AI4SmallFarms datasets have shown
+that fabSAM achieves a significant improvement in farmland region
+identification and boundary delineation. Compared to zero-shot SAM, fabSAM
+surpassed it by 23.5% and 15.1% in mIOU on the AI4Boundaries and AI4SmallFarms
+datasets, respectively. For Deeplabv3+, fabSAM outperformed it by 4.9% and
+12.5% in mIOU, respectively. These results highlight the effectiveness of
+fabSAM, which also means that we can more easily obtain the global farmland
+region and boundary maps from open-source satellite image datasets like
+Sentinel-2.
+
+
+
+
+ + ☆ TOFFE -- Temporally-binned Object Flow from Events for High-speed and + Energy-Efficient Object Detection and Tracking + + +
+ Object detection and tracking is an essential perception task for enabling +fully autonomous navigation in robotic systems. Edge robot systems such as +small drones need to execute complex maneuvers at high-speeds with limited +resources, which places strict constraints on the underlying algorithms and +hardware. Traditionally, frame-based cameras are used for vision-based +perception due to their rich spatial information and simplified synchronous +sensing capabilities. However, obtaining detailed information across frames +incurs high energy consumption and may not even be required. In addition, their +low temporal resolution renders them ineffective in high-speed motion +scenarios. Event-based cameras offer a biologically-inspired solution to this +by capturing only changes in intensity levels at exceptionally high temporal +resolution and low power consumption, making them ideal for high-speed motion +scenarios. However, their asynchronous and sparse outputs are not natively +suitable with conventional deep learning methods. In this work, we propose +TOFFE, a lightweight hybrid framework for performing event-based object motion +estimation (including pose, direction, and speed estimation), referred to as +Object Flow. TOFFE integrates bio-inspired Spiking Neural Networks (SNNs) and +conventional Analog Neural Networks (ANNs), to efficiently process events at +high temporal resolutions while being simple to train. Additionally, we present +a novel event-based synthetic dataset involving high-speed object motion to +train TOFFE. Our experimental results show that TOFFE achieves 5.7x/8.3x +reduction in energy consumption and 4.6x/5.8x reduction in latency on edge +GPU(Jetson TX2)/hybrid hardware(Loihi-2 and Jetson TX2), compared to previous +event-based object detection baselines. + +
+
+ comment: 8 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Slot-BERT: Self-supervised Object Discovery in Surgical Video + + +
+ Object-centric slot attention is a powerful framework for unsupervised +learning of structured and explainable representations that can support +reasoning about objects and actions, including in surgical videos. While +conventional object-centric methods for videos leverage recurrent processing to +achieve efficiency, they often struggle with maintaining long-range temporal +coherence required for long videos in surgical applications. On the other hand, +fully parallel processing of entire videos enhances temporal consistency but +introduces significant computational overhead, making it impractical for +implementation on hardware in medical facilities. We present Slot-BERT, a +bidirectional long-range model that learns object-centric representations in a +latent space while ensuring robust temporal coherence. Slot-BERT scales object +discovery seamlessly to long videos of unconstrained lengths. A novel slot +contrastive loss further reduces redundancy and improves the representation +disentanglement by enhancing slot orthogonality. We evaluate Slot-BERT on +real-world surgical video datasets from abdominal, cholecystectomy, and +thoracic procedures. Our method surpasses state-of-the-art object-centric +approaches under unsupervised training achieving superior performance across +diverse domains. We also demonstrate efficient zero-shot domain adaptation to +data from diverse surgical specialties and databases. + +
+
+
+
+
+ + ☆ Owls are wise and foxes are unfaithful: Uncovering animal stereotypes in + vision-language models + + +
+ Animal stereotypes are deeply embedded in human culture and language. They +often shape our perceptions and expectations of various species. Our study +investigates how animal stereotypes manifest in vision-language models during +the task of image generation. Through targeted prompts, we explore whether +DALL-E perpetuates stereotypical representations of animals, such as "owls as +wise," "foxes as unfaithful," etc. Our findings reveal significant stereotyped +instances where the model consistently generates images aligned with cultural +biases. The current work is the first of its kind to examine animal +stereotyping in vision-language models systematically and to highlight a +critical yet underexplored dimension of bias in AI-generated visual content. + +
+
+
+
+
+ + ♻ ☆ FoundationStereo: Zero-Shot Stereo Matching + + +
+ Tremendous progress has been made in deep stereo matching to excel on +benchmark datasets through per-domain fine-tuning. However, achieving strong +zero-shot generalization - a hallmark of foundation models in other computer +vision tasks - remains challenging for stereo matching. We introduce +FoundationStereo, a foundation model for stereo depth estimation designed to +achieve strong zero-shot generalization. To this end, we first construct a +large-scale (1M stereo pairs) synthetic training dataset featuring large +diversity and high photorealism, followed by an automatic self-curation +pipeline to remove ambiguous samples. We then design a number of network +architecture components to enhance scalability, including a side-tuning feature +backbone that adapts rich monocular priors from vision foundation models to +mitigate the sim-to-real gap, and long-range context reasoning for effective +cost volume filtering. Together, these components lead to strong robustness and +accuracy across domains, establishing a new standard in zero-shot stereo depth +estimation. Project page: https://nvlabs.github.io/FoundationStereo/ + +
+
+
+
+
+ + ♻ ☆ Let There Be Light: Robust Lensless Imaging Under External Illumination + With Deep Learning + + +
+ Lensless cameras relax the design constraints of traditional cameras by +shifting image formation from analog optics to digital post-processing. While +new camera designs and applications can be enabled, lensless imaging is very +sensitive to unwanted interference (other sources, noise, etc.). In this work, +we address a prevalent noise source that has not been studied for lensless +imaging: external illumination e.g. from ambient and direct lighting. Being +robust to a variety of lighting conditions would increase the practicality and +adoption of lensless imaging. To this end, we propose multiple recovery +approaches that account for external illumination by incorporating its estimate +into the image recovery process. At the core is a physics-based reconstruction +that combines learnable image recovery and denoisers, all of whose parameters +are trained using experimentally gathered data. Compared to standard +reconstruction methods, our approach yields significant qualitative and +quantitative improvements. We open-source our implementations and a 25K dataset +of measurements under multiple lighting conditions. + +
+
+ comment: 4 pages, dataset: https://doi.org/10.57967/hf/2970, accepted to + ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ LiteVAE: Lightweight and Efficient Variational Autoencoders for Latent + Diffusion Models NeurIPS 2024 + + +
+ Advances in latent diffusion models (LDMs) have revolutionized +high-resolution image generation, but the design space of the autoencoder that +is central to these systems remains underexplored. In this paper, we introduce +LiteVAE, a new autoencoder design for LDMs, which leverages the 2D discrete +wavelet transform to enhance scalability and computational efficiency over +standard variational autoencoders (VAEs) with no sacrifice in output quality. +We investigate the training methodologies and the decoder architecture of +LiteVAE and propose several enhancements that improve the training dynamics and +reconstruction quality. Our base LiteVAE model matches the quality of the +established VAEs in current LDMs with a six-fold reduction in encoder +parameters, leading to faster training and lower GPU memory requirements, while +our larger model outperforms VAEs of comparable complexity across all evaluated +metrics (rFID, LPIPS, PSNR, and SSIM). + +
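The 2D discrete wavelet transform at the heart of the encoder can be illustrated with a single-level Haar decomposition; the snippet below is a generic sketch of that step, not LiteVAE's actual feature extractor.

import torch

def haar_dwt2d(x):
    """One level of a 2D Haar wavelet transform on a (B, C, H, W) tensor.
    Returns low-low, low-high, high-low and high-high sub-bands at half resolution."""
    a = x[:, :, 0::2, 0::2]   # even rows, even cols
    b = x[:, :, 0::2, 1::2]
    c = x[:, :, 1::2, 0::2]
    d = x[:, :, 1::2, 1::2]
    ll = (a + b + c + d) / 2
    lh = (a - b + c - d) / 2
    hl = (a + b - c - d) / 2
    hh = (a - b - c + d) / 2
    return ll, lh, hl, hh

x = torch.randn(1, 3, 256, 256)
ll, lh, hl, hh = haar_dwt2d(x)
print(ll.shape)   # torch.Size([1, 3, 128, 128]); sub-bands can be stacked and fed to the encoder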
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SANER: Annotation-free Societal Attribute Neutralizer for Debiasing CLIP + + +
+ Large-scale vision-language models, such as CLIP, are known to contain +societal bias regarding protected attributes (e.g., gender, age). This paper +aims to address the problems of societal bias in CLIP. Although previous +studies have proposed to debias societal bias through adversarial learning or +test-time projecting, our comprehensive study of these works identifies two +critical limitations: 1) loss of attribute information when it is explicitly +disclosed in the input and 2) use of the attribute annotations during debiasing +process. To mitigate societal bias in CLIP and overcome these limitations +simultaneously, we introduce a simple-yet-effective debiasing method called +SANER (societal attribute neutralizer) that eliminates attribute information +from CLIP text features only of attribute-neutral descriptions. Experimental +results show that SANER, which does not require attribute annotations and +preserves original information for attribute-specific descriptions, +demonstrates superior debiasing ability than the existing methods. +Additionally, we observe that SANER does not require retraining CLIP from +scratch with the original dataset. Moreover, the debiased model can be directly +applied to the text-to-image generation model by simply replacing the text +encoder. + +
+
+
+
+
+ + ♻ ☆ Untrained Perceptual Loss for image denoising of line-like structures in + MR images + + +
+ In the acquisition of Magnetic Resonance (MR) images, shorter scan times lead
to higher image noise. Therefore, automatic image denoising using deep learning
methods is of high interest. MR images containing line-like structures such as
roots or vessels exhibit special characteristics, as they display connected
structures and carry sparse information. For this kind of data, it is important
to consider voxel neighborhoods when training a denoising network. In this
paper, we translate the Perceptual Loss to 3D data by comparing feature maps of
untrained networks in the loss function as done previously for 2D data. We
tested the performance of untrained Perceptual Loss (uPL) on 3D image denoising
of MR images displaying brain vessels (MR angiograms - MRA) and images of plant
roots in soil. We investigate the impact of various uPL characteristics such as
weight initialization, network depth, kernel size, and pooling operations on
the results. We tested the performance of the uPL loss on four Rician noise
levels using evaluation metrics such as the Structural Similarity Index Metric
(SSIM). We observe that our uPL outperforms conventional loss functions such
as the L1 loss or a loss based on SSIM. The uPL network's initialization is not
important, while network depth and pooling operations impact denoising
performance. For example, for both datasets a network with five convolutional
layers led to the best performance, while a network with more layers led to a
performance drop. We also find that small uPL networks led to better or
comparable results compared to large networks such as VGG. We observe superior
performance of our loss for both datasets, all noise levels, and three network
architectures. In conclusion, for images containing line-like structures, uPL
is an alternative to other loss functions for 3D image denoising.
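A minimal 3D sketch of an untrained Perceptual Loss, assuming a small randomly initialized and frozen convolutional feature extractor; the depth, width, and kernel size below are placeholders, not the configurations studied in the paper.

import torch
import torch.nn as nn

class UntrainedPerceptualLoss3D(nn.Module):
    """Compares feature maps of a small, randomly initialized (and frozen)
    3D conv network between the denoised output and the ground truth."""
    def __init__(self, channels=1, width=16, depth=5, kernel_size=3):
        super().__init__()
        layers, c_in = [], channels
        for _ in range(depth):
            layers += [nn.Conv3d(c_in, width, kernel_size, padding=kernel_size // 2), nn.ReLU()]
            c_in = width
        self.features = nn.Sequential(*layers)
        for p in self.features.parameters():
            p.requires_grad_(False)              # untrained and frozen

    def forward(self, prediction, target):
        return nn.functional.l1_loss(self.features(prediction), self.features(target))

upl = UntrainedPerceptualLoss3D()
loss = upl(torch.randn(1, 1, 32, 32, 32), torch.randn(1, 1, 32, 32, 32))
print(loss.item())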
+
+
+
+
+ + ♻ ☆ VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction + + +
+ Recent Multimodal Large Language Models (MLLMs) have typically focused on
integrating visual and textual modalities, with less emphasis placed on the
role of speech in enhancing interaction. However, speech plays a crucial role
in multimodal dialogue systems, and achieving high performance in both
vision and speech tasks remains a significant challenge due to the fundamental
modality differences. In this paper, we propose a carefully designed
multi-stage training methodology that progressively trains the LLM to understand
both visual and speech information, ultimately enabling fluent vision and
speech interaction. Our approach not only preserves strong vision-language
capacity, but also enables efficient speech-to-speech dialogue capabilities
without separate ASR and TTS modules, significantly accelerating multimodal
end-to-end response speed. By comparing our method against state-of-the-art
counterparts across benchmarks for image, video, and speech tasks, we
demonstrate that our model is equipped with both strong visual and speech
capabilities, enabling near real-time vision and speech interaction.
+
+ comment: https://github.com/VITA-MLLM/VITA (2K+ Stars by now) +
+
+
+
+
+ + ♻ ☆ Multi-Scale Texture Loss for CT denoising with GANs + + +
+ Generative Adversarial Networks (GANs) have proven to be a powerful framework
for denoising applications in medical imaging. However, GAN-based denoising
algorithms still suffer from limitations in capturing complex relationships
within the images. In this regard, the loss function plays a crucial role in
guiding the image generation process, quantifying how much a synthetic image
differs from a real image. To grasp highly complex and non-linear textural
relationships in the training process, this work presents a novel approach to
capture and embed multi-scale texture information into the loss function. Our
method introduces a differentiable multi-scale texture representation of the
images dynamically aggregated by a self-attention layer, thus exploiting
end-to-end gradient-based optimization. We validate our approach by carrying
out extensive experiments in the context of low-dose CT denoising, a
challenging application that aims to enhance the quality of noisy CT scans. We
utilize three publicly available datasets, including one simulated and two real
datasets. The results are promising compared to other well-established loss
functions and are consistent across three different GAN architectures. The
code is available at:
https://github.com/TrainLaboratory/MultiScaleTextureLoss-MSTLF
+
+
+
+
+ + ♻ ☆ TAB: Transformer Attention Bottlenecks enable User Intervention and + Debugging in Vision-Language Models + + +
+ Multi-head self-attention (MHSA) is a key component of Transformers, a widely +popular architecture in both language and vision. Multiple heads intuitively +enable different parallel processes over the same input. Yet, they also obscure +the attribution of each input patch to the output of a model. We propose a +novel 1-head Transformer Attention Bottleneck (TAB) layer, inserted after the +traditional MHSA architecture, to serve as an attention bottleneck for +interpretability and intervention. Unlike standard self-attention, TAB +constrains the total attention over all patches to $\in [0, 1]$. That is, when +the total attention is 0, no visual information is propagated further into the +network and the vision-language model (VLM) would default to a generic, +image-independent response. To demonstrate the advantages of TAB, we train VLMs +with TAB to perform image difference captioning. Over three datasets, our +models perform similarly to baseline VLMs in captioning but the bottleneck is +superior in localizing changes and in identifying when no changes occur. TAB is +the first architecture to enable users to intervene by editing attention, which +often produces expected outputs by VLMs. + +
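One way to realize a total patch attention constrained to [0, 1] is to softmax over the patches plus a learned null slot, so the mass assigned to real patches sums to at most one; this is an assumed reading of the bottleneck mechanism for illustration, not the paper's exact formulation.

import torch
import torch.nn as nn

class AttentionBottleneck(nn.Module):
    """Single-head attention whose total weight over image patches lies in [0, 1]:
    a learned 'null' slot absorbs the remaining probability mass, so zero total
    attention means no visual information is passed further into the network."""
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.null_score = nn.Parameter(torch.zeros(1))

    def forward(self, query_tok, patches):                              # (B, 1, D), (B, N, D)
        scores = self.q(query_tok) @ self.k(patches).transpose(1, 2)    # (B, 1, N)
        scores = scores / patches.shape[-1] ** 0.5
        null = self.null_score.expand(scores.shape[0], 1, 1)
        full = torch.softmax(torch.cat([scores, null], dim=-1), dim=-1)
        attn = full[..., :-1]                                           # weights over patches only
        total = attn.sum(-1)                                            # in [0, 1]; ~0 => image ignored
        return attn @ patches, total

bottleneck = AttentionBottleneck(dim=64)
out, total = bottleneck(torch.randn(2, 1, 64), torch.randn(2, 196, 64))
print(out.shape, total.squeeze())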
+
+
+
+
+ + ♻ ☆ Beyond Specialization: Assessing the Capabilities of MLLMs in Age and + Gender Estimation + + +
+ Multimodal Large Language Models (MLLMs) have recently gained immense +popularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as +open-source ones such as LLaVA, are essentially general-purpose models and are +applied to solve a wide variety of tasks, including those in computer vision. +These neural networks possess such strong general knowledge and reasoning +abilities that they have proven capable of working even on tasks for which they +were not specifically trained. We compared the capabilities of the most +powerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task +of age and gender estimation with our state-of-the-art specialized model, +MiVOLO. We also updated MiVOLO and provide details and new metrics in this +article. This comparison has yielded some interesting results and insights +about the strengths and weaknesses of the participating models. Furthermore, we +attempted various ways to fine-tune the ShareGPT4V model for this specific +task, aiming to achieve state-of-the-art results in this particular challenge. +Although such a model would not be practical in production, as it is incredibly +expensive compared to a specialized model like MiVOLO, it could be very useful +in some tasks, like data annotation. + +
+
+
+
+
+ + ♻ ☆ FViT: A Focal Vision Transformer with Gabor Filter + + +
+ Vision transformers have achieved encouraging progress in various computer +vision tasks. A common belief is that this is attributed to the capability of +self-attention in modeling the global dependencies among feature tokens. +However, self-attention still faces several challenges in dense prediction +tasks, including high computational complexity and absence of desirable +inductive bias. To alleviate these issues, the potential advantages of +combining vision transformers with Gabor filters are revisited, and a learnable +Gabor filter (LGF) using convolution is proposed. The LGF does not rely on +self-attention, and it is used to simulate the response of fundamental cells in +the biological visual system to the input images. This encourages vision +transformers to focus on discriminative feature representations of targets +across different scales and orientations. In addition, a Bionic Focal Vision +(BFV) block is designed based on the LGF. This block draws inspiration from +neuroscience and introduces a Dual-Path Feed Forward Network (DPFFN) to emulate +the parallel and cascaded information processing scheme of the biological +visual cortex. Furthermore, a unified and efficient family of pyramid backbone +networks called Focal Vision Transformers (FViTs) is developed by stacking BFV +blocks. Experimental results indicate that FViTs demonstrate superior +performance in various vision tasks. In terms of computational efficiency and +scalability, FViTs show significant advantages compared with other +counterparts. + +
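A learnable Gabor filter can be sketched by parameterizing the Gabor equation and materializing it as a convolution kernel; the parameterization below (shared isotropic sigma, one kernel per output channel, single-channel input) is an assumption for illustration, not the paper's LGF.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class LearnableGaborConv(nn.Module):
    """Builds Gabor kernels from learnable (theta, sigma, lambda, psi) parameters
    and applies them as a convolution over a single-channel input."""
    def __init__(self, out_channels=8, kernel_size=11):
        super().__init__()
        self.theta = nn.Parameter(torch.rand(out_channels) * math.pi)   # orientation
        self.sigma = nn.Parameter(torch.full((out_channels,), 3.0))     # Gaussian envelope width
        self.lambd = nn.Parameter(torch.full((out_channels,), 6.0))     # carrier wavelength
        self.psi = nn.Parameter(torch.zeros(out_channels))              # phase offset
        half = kernel_size // 2
        ys, xs = torch.meshgrid(torch.arange(-half, half + 1).float(),
                                torch.arange(-half, half + 1).float(), indexing="ij")
        self.register_buffer("xs", xs)
        self.register_buffer("ys", ys)

    def forward(self, x):                                               # x: (B, 1, H, W)
        t = self.theta.view(-1, 1, 1)
        x_rot = self.xs * torch.cos(t) + self.ys * torch.sin(t)
        y_rot = -self.xs * torch.sin(t) + self.ys * torch.cos(t)
        envelope = torch.exp(-(x_rot ** 2 + y_rot ** 2) / (2 * self.sigma.view(-1, 1, 1) ** 2))
        carrier = torch.cos(2 * math.pi * x_rot / self.lambd.view(-1, 1, 1) + self.psi.view(-1, 1, 1))
        kernels = (envelope * carrier).unsqueeze(1)                     # (out_channels, 1, k, k)
        return F.conv2d(x, kernels, padding=kernels.shape[-1] // 2)

layer = LearnableGaborConv()
print(layer(torch.randn(1, 1, 64, 64)).shape)                           # torch.Size([1, 8, 64, 64])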
+
+ comment: This work has been submitted to Elsevier for possible publication +
+
+
+
+
+ + ♻ ☆ Evaluating the Efficacy of Cut-and-Paste Data Augmentation in Semantic + Segmentation for Satellite Imagery RSS 2024 + + +
+ Satellite imagery is crucial for tasks like environmental monitoring and +urban planning. Typically, it relies on semantic segmentation or Land Use Land +Cover (LULC) classification to categorize each pixel. Despite the advancements +brought about by Deep Neural Networks (DNNs), their performance in segmentation +tasks is hindered by challenges such as limited availability of labeled data, +class imbalance and the inherent variability and complexity of satellite +images. In order to mitigate those issues, our study explores the effectiveness +of a Cut-and-Paste augmentation technique for semantic segmentation in +satellite images. We adapt this augmentation, which usually requires labeled +instances, to the case of semantic segmentation. By leveraging the connected +components in the semantic segmentation labels, we extract instances that are +then randomly pasted during training. Using the DynamicEarthNet dataset and a +U-Net model for evaluation, we found that this augmentation significantly +enhances the mIoU score on the test set from 37.9 to 44.1. This finding +highlights the potential of the Cut-and-Paste augmentation to improve the +generalization capabilities of semantic segmentation models in satellite +imagery. + +
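A sketch of the adapted Cut-and-Paste augmentation on (image, mask) pairs using connected components; single-channel tiles, uniform random placement, and hard pasting are simplifying assumptions rather than the paper's exact pipeline.

import numpy as np
from scipy import ndimage

def cut_and_paste(src_img, src_mask, dst_img, dst_mask, class_id):
    """Extracts one connected component of `class_id` from the source pair and
    pastes it at a random location in the destination image and label mask."""
    components, n = ndimage.label(src_mask == class_id)
    if n == 0:
        return dst_img, dst_mask
    comp = components == np.random.randint(1, n + 1)
    ys, xs = np.where(comp)
    patch_img = src_img[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
    patch_m = comp[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
    h, w = patch_m.shape
    top = np.random.randint(0, dst_img.shape[0] - h + 1)
    left = np.random.randint(0, dst_img.shape[1] - w + 1)
    region_img = dst_img[top:top + h, left:left + w]
    region_msk = dst_mask[top:top + h, left:left + w]
    region_img[patch_m] = patch_img[patch_m]      # paste pixels of the instance
    region_msk[patch_m] = class_id                # and its semantic label
    return dst_img, dst_mask

# Toy example on 64x64 single-channel tiles with one labeled instance.
src, dst = np.random.rand(64, 64), np.random.rand(64, 64)
src_m = np.zeros((64, 64), np.int64)
src_m[10:20, 10:20] = 1
dst_m = np.zeros((64, 64), np.int64)
img, msk = cut_and_paste(src, src_m, dst, dst_m, class_id=1)
print(msk.sum())   # 100 pasted pixels of class 1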
+
+ comment: Published in: IGARSS 2024 - 2024 IEEE International Geoscience and + Remote Sensing Symposium +
+
+
+
+
+ + ♻ ☆ CoDTS: Enhancing Sparsely Supervised Collaborative Perception with a + Dual Teacher-Student Framework + + +
+ Current collaborative perception methods often rely on fully annotated +datasets, which can be expensive to obtain in practical situations. To reduce +annotation costs, some works adopt sparsely supervised learning techniques and +generate pseudo labels for the missing instances. However, these methods fail +to achieve an optimal confidence threshold that harmonizes the quality and +quantity of pseudo labels. To address this issue, we propose an end-to-end +Collaborative perception Dual Teacher-Student framework (CoDTS), which employs +adaptive complementary learning to produce both high-quality and high-quantity +pseudo labels. Specifically, the Main Foreground Mining (MFM) module generates +high-quality pseudo labels based on the prediction of the static teacher. +Subsequently, the Supplement Foreground Mining (SFM) module ensures a balance +between the quality and quantity of pseudo labels by adaptively identifying +missing instances based on the prediction of the dynamic teacher. Additionally, +the Neighbor Anchor Sampling (NAS) module is incorporated to enhance the +representation of pseudo labels. To promote the adaptive complementary +learning, we implement a staged training strategy that trains the student and +dynamic teacher in a mutually beneficial manner. Extensive experiments +demonstrate that the CoDTS effectively ensures an optimal balance of pseudo +labels in both quality and quantity, establishing a new state-of-the-art in +sparsely supervised collaborative perception. + +
+
+ comment: AAAI 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI + Generation and Diffuse Glioma Growth Prediction + + +
+ Diffuse gliomas are malignant brain tumors that grow widespread through the +brain. The complex interactions between neoplastic cells and normal tissue, as +well as the treatment-induced changes often encountered, make glioma tumor +growth modeling challenging. In this paper, we present a novel end-to-end +network capable of future predictions of tumor masks and multi-parametric +magnetic resonance images (MRI) of how the tumor will look at any future time +points for different treatment plans. Our approach is based on cutting-edge +diffusion probabilistic models and deep-segmentation neural networks. We +included sequential multi-parametric MRI and treatment information as +conditioning inputs to guide the generative diffusion process as well as a +joint segmentation process. This allows for tumor growth estimates and +realistic MRI generation at any given treatment and time point. We trained the +model using real-world postoperative longitudinal MRI data with glioma tumor +growth trajectories represented as tumor segmentation maps over time. The model +demonstrates promising performance across various tasks, including generating +high-quality multi-parametric MRI with tumor masks, performing time-series +tumor segmentations, and providing uncertainty estimates. Combined with the +treatment-aware generated MRI, the tumor growth predictions with uncertainty +estimates can provide useful information for clinical decision-making. + +
+
+ comment: preprints in the IEEE-TMI +
+
+
+
+
+ + ♻ ☆ Towards Robust and Realistic Human Pose Estimation via WiFi Signals + + +
+ Robust WiFi-based human pose estimation is a challenging task that bridges +discrete and subtle WiFi signals to human skeletons. This paper revisits this +problem and reveals two critical yet overlooked issues: 1) cross-domain gap, +i.e., due to significant variations between source-target domain pose +distributions; and 2) structural fidelity gap, i.e., predicted skeletal poses +manifest distorted topology, usually with misplaced joints and disproportionate +bone lengths. This paper fills these gaps by reformulating the task into a +novel two-phase framework dubbed DT-Pose: Domain-consistent representation +learning and Topology-constrained Pose decoding. Concretely, we first propose a +temporal-consistent contrastive learning strategy with uniformity +regularization, coupled with self-supervised masking-reconstruction operations, +to enable robust learning of domain-consistent and motion-discriminative +WiFi-specific representations. Beyond this, we introduce a simple yet effective +pose decoder with task prompts, which integrates Graph Convolution Network +(GCN) and Transformer layers to constrain the topology structure of the +generated skeleton by exploring the adjacent-overarching relationships among +human joints. Extensive experiments conducted on various benchmark datasets +highlight the superior performance of our method in tackling these fundamental +challenges in both 2D/3D human pose estimation tasks. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ F3D-Gaus: Feed-forward 3D-aware Generation on ImageNet with + Cycle-Consistent Gaussian Splatting + + +
+ This paper tackles the problem of generalizable 3D-aware generation from +monocular datasets, e.g., ImageNet. The key challenge of this task is learning +a robust 3D-aware representation without multi-view or dynamic data, while +ensuring consistent texture and geometry across different viewpoints. Although +some baseline methods are capable of 3D-aware generation, the quality of the +generated images still lags behind state-of-the-art 2D generation approaches, +which excel in producing high-quality, detailed images. To address this severe +limitation, we propose a novel feed-forward pipeline based on pixel-aligned +Gaussian Splatting, coined as F3D-Gaus, which can produce more realistic and +reliable 3D renderings from monocular inputs. In addition, we introduce a +self-supervised cycle-consistent constraint to enforce cross-view consistency +in the learned 3D representation. This training strategy naturally allows +aggregation of multiple aligned Gaussian primitives and significantly +alleviates the interpolation limitations inherent in single-view pixel-aligned +Gaussian Splatting. Furthermore, we incorporate video model priors to perform +geometry-aware refinement, enhancing the generation of fine details in +wide-viewpoint scenarios and improving the model's capability to capture +intricate 3D textures. Extensive experiments demonstrate that our approach not +only achieves high-quality, multi-view consistent 3D-aware generation from +monocular datasets, but also significantly improves training and inference +efficiency. + +
+
+ comment: Project Page: https://w-ted.github.io/publications/F3D-Gaus +
+
+
+
+
+ + ♻ ☆ DehazeGS: Seeing Through Fog with 3D Gaussian Splatting + + +
+ Current novel view synthesis tasks primarily rely on high-quality and clear
images. However, in foggy scenes, scattering and attenuation can significantly
degrade the reconstruction and rendering quality. Although NeRF-based dehazing
reconstruction algorithms have been developed, their use of deep fully
connected neural networks and per-ray sampling strategies leads to high
computational costs. Moreover, NeRF's implicit representation struggles to
recover fine details from hazy scenes. In contrast, recent advancements in 3D
Gaussian Splatting achieve high-quality 3D scene reconstruction by explicitly
modeling point clouds into 3D Gaussians. In this paper, we propose leveraging
the explicit Gaussian representation to explain the foggy image formation
process through a physically accurate forward rendering process. We introduce
DehazeGS, a method capable of decomposing and rendering a fog-free background
from participating media using only multi-view foggy images as input. We model
the transmission within each Gaussian distribution to simulate the formation of
fog. During this process, we jointly learn the atmospheric light and scattering
coefficient while optimizing the Gaussian representation of the hazy scene. In
the inference stage, we eliminate the effects of scattering and attenuation on
the Gaussians and directly project them onto a 2D plane to obtain a clear view.
Experiments on both synthetic and real-world foggy datasets demonstrate that
DehazeGS achieves state-of-the-art performance in terms of both rendering
quality and computational efficiency. Visualizations are available at
https://dehazegs.github.io/
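The foggy image formation being inverted follows the standard atmospheric scattering model; the numpy sketch below uses fixed atmospheric light and scattering coefficient, whereas DehazeGS learns both while optimizing the Gaussians, and its per-Gaussian transmission modeling is not reproduced here.

import numpy as np

def apply_fog(clear_rgb, depth, atmospheric_light=0.8, scattering_beta=0.1):
    """Atmospheric scattering model: I = J * t + A * (1 - t),
    with transmission t = exp(-beta * depth)."""
    t = np.exp(-scattering_beta * depth)[..., None]          # (H, W, 1) transmission map
    return clear_rgb * t + atmospheric_light * (1.0 - t)

clear = np.random.rand(120, 160, 3)                           # toy clear view J
depth = np.random.uniform(1.0, 50.0, size=(120, 160))         # per-pixel depth
foggy = apply_fog(clear, depth)
print(foggy.shape, foggy.min() >= 0.0, foggy.max() <= 1.0)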
+
+ comment: 9 pages,4 figures. visualizations are available at + https://dehazegs.github.io/ +
+
+
+
+
+ + ♻ ☆ Diversify, Don't Fine-Tune: Scaling Up Visual Recognition Training with + Synthetic Images + + +
+ Recent advances in generative deep learning have enabled the creation of +high-quality synthetic images in text-to-image generation. Prior work shows +that fine-tuning a pretrained diffusion model on ImageNet and generating +synthetic training images from the finetuned model can enhance an ImageNet +classifier's performance. However, performance degrades as synthetic images +outnumber real ones. In this paper, we explore whether generative fine-tuning +is essential for this improvement and whether it is possible to further scale +up training using more synthetic data. We present a new framework leveraging +off-the-shelf generative models to generate synthetic training images, +addressing multiple challenges: class name ambiguity, lack of diversity in +naive prompts, and domain shifts. Specifically, we leverage large language +models (LLMs) and CLIP to resolve class name ambiguity. To diversify images, we +propose contextualized diversification (CD) and stylized diversification (SD) +methods, also prompted by LLMs. Finally, to mitigate domain shifts, we leverage +domain adaptation techniques with auxiliary batch normalization for synthetic +images. Our framework consistently enhances recognition model performance with +more synthetic data, up to 6x of original ImageNet size showcasing the +potential of synthetic data for improved recognition models and strong +out-of-domain generalization. + +
+
+ comment: Accepted by Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Implicitly Learned Neural Phase Functions for Basis-Free Point Spread + Function Engineering + + +
+ Point spread function (PSF) engineering is vital for precisely controlling +the focus of light in computational imaging, with applications in neural +imaging, fluorescence microscopy, and biophotonics. The PSF is derived from the +magnitude of the Fourier transform of a phase function, making the construction +of the phase function given the PSF (PSF engineering) an ill-posed inverse +problem. Traditional PSF engineering methods rely on physical basis functions, +limiting their ability to generalize across the range of PSFs required for +imaging tasks. We introduce a novel approach leveraging implicit neural +representations that overcome the limitations of pixel-wise optimization +methods. Our approach achieves a median MSSIM of 0.8162 and a mean MSSIM of +0.5634, compared to a median MSSIM of 0.0 and a mean MSSIM of 0.1841 with +pixel-wise optimization when learning randomly generated phase functions. Our +approach also achieves a median PSNR of 10.38 dB and a mean PSNR of 8.672 dB, +compared to a median PSNR of 6.653 dB and a mean PSNR of 6.660 dB with +pixel-wise optimization for this task. + +
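The forward model underlying this inverse problem fits in a few lines: the PSF is the squared magnitude of the Fourier transform of the pupil's complex field built from the phase function; the circular aperture and sampling below are arbitrary assumptions for illustration.

import numpy as np

def psf_from_phase(phase, aperture=None):
    """Computes a point spread function from a phase function phi(x, y):
    PSF = |FFT{ A(x, y) * exp(i * phi(x, y)) }|^2, normalized to unit sum."""
    if aperture is None:
        n = phase.shape[0]
        yy, xx = np.mgrid[-1:1:n * 1j, -1:1:n * 1j]
        aperture = (xx ** 2 + yy ** 2 <= 1.0).astype(float)    # circular pupil
    pupil = aperture * np.exp(1j * phase)
    field = np.fft.fftshift(np.fft.fft2(np.fft.ifftshift(pupil)))
    psf = np.abs(field) ** 2
    return psf / psf.sum()

phase = np.random.uniform(-np.pi, np.pi, size=(256, 256))      # a random phase mask
psf = psf_from_phase(phase)
print(psf.shape, np.isclose(psf.sum(), 1.0))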
+
+ comment: 3 pages, 7 figures. To be published in ICVISP 2024 + (https://www.icvisp.org/) +
+
+
+
+
+ + ♻ ☆ Evolver: Chain-of-Evolution Prompting to Boost Large Multimodal Models + for Hateful Meme Detection + + +
+ Recent advances show that two-stream approaches have achieved outstanding +performance in hateful meme detection. However, hateful memes constantly evolve +as new memes emerge by fusing progressive cultural ideas, making existing +methods obsolete or ineffective. In this work, we explore the potential of +Large Multimodal Models (LMMs) for hateful meme detection. To this end, we +propose Evolver, which incorporates LMMs via Chain-of-Evolution (CoE) +Prompting, by integrating the evolution attribute and in-context information of +memes. Specifically, Evolver simulates the evolving and expressing process of +memes and reasons through LMMs in a step-by-step manner. First, an evolutionary +pair mining module retrieves the top-k most similar memes in the external +curated meme set with the input meme. Second, an evolutionary information +extractor is designed to summarize the semantic regularities between the paired +memes for prompting. Finally, a contextual relevance amplifier enhances the +in-context hatefulness information to boost the search for evolutionary +processes. Extensive experiments on public FHM, MAMI, and HarM datasets show +that CoE prompting can be incorporated into existing LMMs to improve their +performance. More encouragingly, it can serve as an interpretive tool to +promote the understanding of the evolution of social memes. [Homepage] +(https://github.com/inFaaa/Evolver) + +
+
+ comment: accepted by COLING 2025 +
+
+
+
+
+ + ♻ ☆ FLAME: Learning to Navigate with Multimodal LLM in Urban Environments + + +
+ Large Language Models (LLMs) have demonstrated potential in +Vision-and-Language Navigation (VLN) tasks, yet current applications face +challenges. While LLMs excel in general conversation scenarios, they struggle +with specialized navigation tasks, yielding suboptimal performance compared to +specialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied +Agent), a novel Multimodal LLM-based agent and architecture designed for urban +VLN tasks that efficiently handles multiple observations. Our approach +implements a three-phase tuning technique for effective adaptation to +navigation tasks, including single perception tuning for street view +description, multiple perception tuning for route summarization, and end-to-end +training on VLN datasets. The augmented datasets are synthesized automatically. +Experimental results demonstrate FLAME's superiority over existing methods, +surpassing state-of-the-art methods by a 7.3% increase in task completion on +Touchdown dataset. This work showcases the potential of Multimodal LLMs (MLLMs) +in complex navigation tasks, representing an advancement towards applications +of MLLMs in the field of embodied intelligence. + +
+
+ comment: Accepted to AAAI 2025 (Oral) +
+
+
+
+
+ + ♻ ☆ Grid: Omni Visual Generation + + +
+ Visual generation has witnessed remarkable progress in single-image tasks,
yet extending these capabilities to temporal sequences remains challenging.
Current approaches either build specialized video models from scratch with
enormous computational costs or add separate motion modules to image
generators, both requiring learning temporal dynamics anew. We observe that
modern image generation models possess underutilized potential in handling
structured layouts with implicit temporal understanding. Building on this
insight, we introduce GRID, which reformulates temporal sequences as grid
layouts, enabling holistic processing of visual sequences while leveraging
existing model capabilities. Through a parallel flow-matching training strategy
with coarse-to-fine scheduling, our approach achieves up to 67x faster inference
speeds while using less than 1/1000 of the computational resources compared to
specialized models. Extensive experiments demonstrate that GRID not only excels
in temporal tasks from Text-to-Video to 3D Editing but also preserves strong
performance in image generation, establishing itself as an efficient and
versatile omni-solution for visual generation.
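The core reformulation can be illustrated as a reshape of T frames into a rows-by-cols grid image that an image generator processes holistically; this is a sketch of the layout idea only, not the released code.

import torch

def frames_to_grid(frames, rows, cols):
    """frames: (T, C, H, W) with T == rows * cols.
    Returns a single (C, rows*H, cols*W) grid image."""
    t, c, h, w = frames.shape
    assert t == rows * cols
    grid = frames.reshape(rows, cols, c, h, w)       # lay frames out on a 2D grid
    grid = grid.permute(2, 0, 3, 1, 4)               # (C, rows, H, cols, W)
    return grid.reshape(c, rows * h, cols * w)

def grid_to_frames(grid, rows, cols):
    """Inverse operation: splits a grid image back into its frames."""
    c, gh, gw = grid.shape
    h, w = gh // rows, gw // cols
    frames = grid.reshape(c, rows, h, cols, w).permute(1, 3, 0, 2, 4)
    return frames.reshape(rows * cols, c, h, w)

video = torch.randn(16, 3, 64, 64)                   # 16 frames
grid = frames_to_grid(video, rows=4, cols=4)         # a single (3, 256, 256) "image"
print(grid.shape, torch.equal(grid_to_frames(grid, 4, 4), video))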
+
+ comment: Codes: https://github.com/Should-AI-Lab/GRID +
+
+
+
+
+ + ♻ ☆ DCPI-Depth: Explicitly Infusing Dense Correspondence Prior to + Unsupervised Monocular Depth Estimation + + +
+ There has been a recent surge of interest in learning to perceive depth from +monocular videos in an unsupervised fashion. A key challenge in this field is +achieving robust and accurate depth estimation in challenging scenarios, +particularly in regions with weak textures or where dynamic objects are +present. This study makes three major contributions by delving deeply into +dense correspondence priors to provide existing frameworks with explicit +geometric constraints. The first novelty is a contextual-geometric depth +consistency loss, which employs depth maps triangulated from dense +correspondences based on estimated ego-motion to guide the learning of depth +perception from contextual information, since explicitly triangulated depth +maps capture accurate relative distances among pixels. The second novelty +arises from the observation that there exists an explicit, deducible +relationship between optical flow divergence and depth gradient. A differential +property correlation loss is, therefore, designed to refine depth estimation +with a specific emphasis on local variations. The third novelty is a +bidirectional stream co-adjustment strategy that enhances the interaction +between rigid and optical flows, encouraging the former towards more accurate +correspondence and making the latter more adaptable across various scenarios +under the static scene hypotheses. DCPI-Depth, a framework that incorporates +all these innovative components and couples two bidirectional and collaborative +streams, achieves state-of-the-art performance and generalizability across +multiple public datasets, outperforming all existing prior arts. Specifically, +it demonstrates accurate depth estimation in texture-less and dynamic regions, +and shows more reasonable smoothness. Our source code will be publicly +available at mias.group/DCPI-Depth upon publication. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ SEGT: A General Spatial Expansion Group Transformer for nuScenes + Lidar-based Object Detection Task + + +
+ In the technical report, we present a novel transformer-based framework for +nuScenes lidar-based object detection task, termed Spatial Expansion Group +Transformer (SEGT). To efficiently handle the irregular and sparse nature of +point cloud, we propose migrating the voxels into distinct specialized ordered +fields with the general spatial expansion strategies, and employ group +attention mechanisms to extract the exclusive feature maps within each field. +Subsequently, we integrate the feature representations across different ordered +fields by alternately applying diverse expansion strategies, thereby enhancing +the model's ability to capture comprehensive spatial information. The method +was evaluated on the nuScenes lidar-based object detection test dataset, +achieving an NDS score of 73.9 without Test-Time Augmentation (TTA) and 74.5 +with TTA, demonstrating the effectiveness of the proposed method. Notably, our +method ranks the 1st place in the nuScenes lidar-based object detection task. + +
+
+
+
+
+ + ♻ ☆ EliGen: Entity-Level Controlled Image Generation with Regional Attention + + +
+ Recent advancements in diffusion models have significantly advanced +text-to-image generation, yet global text prompts alone remain insufficient for +achieving fine-grained control over individual entities within an image. To +address this limitation, we present EliGen, a novel framework for Entity-Level +controlled Image Generation. We introduce regional attention, a mechanism for +diffusion transformers that requires no additional parameters, seamlessly +integrating entity prompts and arbitrary-shaped spatial masks. By contributing +a high-quality dataset with fine-grained spatial and semantic entity-level +annotations, we train EliGen to achieve robust and accurate entity-level +manipulation, surpassing existing methods in both spatial precision and image +quality. Additionally, we propose an inpainting fusion pipeline, extending +EliGen's capabilities to multi-entity image inpainting tasks. We further +demonstrate its flexibility by integrating it with other open-source models +such as IP-Adapter, In-Context LoRA and MLLM, unlocking new creative +possibilities. The source code, model, and dataset are published at +https://github.com/modelscope/DiffSynth-Studio. + +
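Regional attention can be sketched as an additive bias that restricts each entity prompt's tokens to the image tokens inside that entity's spatial mask; this is an assumed reading of the mechanism for illustration, not the released implementation.

import torch

def regional_attention_bias(entity_masks, image_hw, tokens_per_entity):
    """entity_masks: (E, H, W) boolean masks, one per entity prompt.
    Returns an additive bias of shape (E * tokens_per_entity, H * W): 0 where an
    entity token may attend to an image token inside its region, -inf elsewhere."""
    h, w = image_hw
    flat = entity_masks.reshape(entity_masks.shape[0], h * w)            # (E, HW)
    allowed = flat.repeat_interleave(tokens_per_entity, dim=0)           # (E*T, HW)
    bias = torch.zeros_like(allowed, dtype=torch.float32)
    bias[~allowed] = float("-inf")
    return bias    # add to the attention logits of entity-prompt tokens before softmax

masks = torch.zeros(2, 32, 32, dtype=torch.bool)
masks[0, :16, :], masks[1, 16:, :] = True, True      # top-half and bottom-half entities
bias = regional_attention_bias(masks, (32, 32), tokens_per_entity=4)
print(bias.shape)                                     # torch.Size([8, 1024])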
+
+
+
+
+ + ♻ ☆ OmniHD-Scenes: A Next-Generation Multimodal Dataset for Autonomous + Driving + + +
+ The rapid advancement of deep learning has intensified the need for +comprehensive data for use by autonomous driving algorithms. High-quality +datasets are crucial for the development of effective data-driven autonomous +driving solutions. Next-generation autonomous driving datasets must be +multimodal, incorporating data from advanced sensors that feature extensive +data coverage, detailed annotations, and diverse scene representation. To +address this need, we present OmniHD-Scenes, a large-scale multimodal dataset +that provides comprehensive omnidirectional high-definition data. The +OmniHD-Scenes dataset combines data from 128-beam LiDAR, six cameras, and six +4D imaging radar systems to achieve full environmental perception. The dataset +comprises 1501 clips, each approximately 30-s long, totaling more than 450K +synchronized frames and more than 5.85 million synchronized sensor data points. +We also propose a novel 4D annotation pipeline. To date, we have annotated 200 +clips with more than 514K precise 3D bounding boxes. These clips also include +semantic segmentation annotations for static scene elements. Additionally, we +introduce a novel automated pipeline for generation of the dense occupancy +ground truth, which effectively leverages information from non-key frames. +Alongside the proposed dataset, we establish comprehensive evaluation metrics, +baseline models, and benchmarks for 3D detection and semantic occupancy +prediction. These benchmarks utilize surround-view cameras and 4D imaging radar +to explore cost-effective sensor solutions for autonomous driving applications. +Extensive experiments demonstrate the effectiveness of our low-cost sensor +configuration and its robustness under adverse conditions. Data will be +released at https://www.2077ai.com/OmniHD-Scenes. + +
+
+
+
+
+ + ♻ ☆ From Data Deluge to Data Curation: A Filtering-WoRA Paradigm for + Efficient Text-based Person Search + + +
+ In text-based person search endeavors, data generation has emerged as a
prevailing practice, addressing concerns over privacy preservation and the
arduous task of manual annotation. Although the amount of synthesized data can
be infinite in theory, the open question remains of how much generated data
optimally fuels subsequent model training. We observe that only a subset of the
data in these constructed datasets plays a decisive role. Therefore, we
introduce a new Filtering-WoRA paradigm, which contains a filtering algorithm
to identify this crucial data subset and a WoRA (Weighted Low-Rank Adaptation)
learning strategy for light fine-tuning. The filtering algorithm is based on
cross-modality relevance and removes the many coarsely matched synthetic pairs.
As the amount of data decreases, we no longer need to fine-tune the entire
model. Therefore, we propose a WoRA learning strategy to efficiently update a
minimal portion of model parameters. WoRA streamlines the learning process,
enabling heightened efficiency in extracting knowledge from fewer, yet potent,
data instances. Extensive experimentation validates the efficacy of
pretraining, where our model achieves advanced and efficient retrieval
performance on challenging real-world benchmarks. Notably, on the CUHK-PEDES
dataset, we have achieved a competitive mAP of 67.02% while reducing model
training time by 19.82%.
+
+
+
+
+ + ♻ ☆ Zero-Shot Scene Change Detection + + +
+ We present a novel, training-free approach to scene change detection. Our
method leverages tracking models, which inherently perform change detection
between consecutive frames of video by identifying common objects and detecting
new or missing objects. Specifically, our method takes advantage of the change
detection effect of the tracking model by inputting reference and query images
instead of consecutive frames. Furthermore, we focus on the content gap and
style gap between two input images in change detection, and address both issues
by proposing adaptive content threshold and style bridging layers,
respectively. Finally, we extend our approach to video, leveraging rich
temporal information to enhance the performance of scene change detection. We
compare our approach against baselines through various experiments. While
existing training-based baselines tend to specialize only in the trained
domain, our method shows consistent performance across various domains,
demonstrating the competitiveness of our approach.
+
+ comment: AAAI 2025. Code available at: https://github.com/kyusik-cho/ZSSCD +
+
+
+
+
+ + ♻ ☆ MambaMOT: State-Space Model as Motion Predictor for Multi-Object + Tracking + + +
+ In the field of multi-object tracking (MOT), traditional methods often rely
on the Kalman filter for motion prediction, leveraging its strengths in linear
motion scenarios. However, the inherent limitations of these methods become
evident when confronted with complex, nonlinear motions and occlusions
prevalent in dynamic environments like sports and dance. This paper explores
the possibilities of replacing the Kalman filter with a learning-based motion
model that effectively enhances tracking accuracy and adaptability beyond the
constraints of Kalman filter-based trackers. Our proposed methods, MambaMOT and
MambaMOT+, demonstrate advanced performance on challenging MOT datasets such as
DanceTrack and SportsMOT, showcasing their ability to handle intricate,
non-linear motion patterns and frequent occlusions more effectively than
traditional methods.
+
+ comment: Accepted by ICASSP 2025. Previous version paper title: Exploring + Learning-based Motion Models in Multi-Object Tracking +
+
+
+
+
+ + ♻ ☆ Robin: a Suite of Multi-Scale Vision-Language Models and the CHIRP + Evaluation Benchmark + + +
+ The proliferation of Vision-Language Models (VLMs) in the past several years +calls for rigorous and comprehensive evaluation methods and benchmarks. This +work analyzes existing VLM evaluation techniques, including automated metrics, +AI-based assessments, and human evaluations across diverse tasks. We first +introduce Robin - a novel suite of VLMs that we built by combining Large +Language Models (LLMs) and Vision Encoders (VEs) at multiple scales, and use +Robin to identify shortcomings of current evaluation approaches across scales. +Next, to overcome the identified limitations, we introduce CHIRP - a new long +form response benchmark we developed for more robust and complete VLM +evaluation. We provide open access to the Robin training code, model suite, and +CHIRP benchmark to promote reproducibility and advance VLM research. + +
+
+
+
+
+ + ♻ ☆ Exploring the Efficacy of Meta-Learning: Unveiling Superior Data + Diversity Utilization of MAML Over Pre-training + + +
+ Currently, data and model size dominate the narrative in the training of +super-large, powerful models. However, there has been a lack of exploration on +the effect of other attributes of the training dataset on model performance. We +hypothesize that dataset diversity can impact the performance of vision models. +Our study shows positive correlations between test set accuracy and data +diversity, providing an argument for furthering the research of dataset +attributes beyond size. We analyzed pre-training and model-agnostic +meta-learning methods on twelve popular visual datasets (e.g., Omniglot, +CIFAR-FS, Aircraft) and five model configurations, including MAML variants with +different numbers of inner gradient steps and supervised learning. We show +moderate to strong positive correlations (R-squared: 0.15-0.42) between +accuracy and data diversity and weaker but significant correlations (R-squared: +~0.2) between loss and diversity. These findings support our hypothesis and +demonstrate a promising way for a deeper exploration of how formal data +diversity influences model performance. This initial study highlights the +potential of (Task2Vec) data diversity as a valuable measure in the rapidly +evolving field of large-scale learning and emphasizes that understanding the +dataset is key to building more powerful and generalizable models. + +
+
+
+
+
+ + ♻ ☆ Label Convergence: Defining an Upper Performance Bound in Object + Recognition through Contradictory Annotations + + +
+ Annotation errors are a challenge not only during training of machine +learning models, but also during their evaluation. Label variations and +inaccuracies in datasets often manifest as contradictory examples that deviate +from established labeling conventions. Such inconsistencies, when significant, +prevent models from achieving optimal performance on metrics such as mean +Average Precision (mAP). We introduce the notion of "label convergence" to +describe the highest achievable performance under the constraint of +contradictory test annotations, essentially defining an upper bound on model +accuracy. + Recognizing that noise is an inherent characteristic of all data, our study +analyzes five real-world datasets, including the LVIS dataset, to investigate +the phenomenon of label convergence. We approximate that label convergence is +between 62.63-67.52 mAP@[0.5:0.95:0.05] for LVIS with 95% confidence, +attributing these bounds to the presence of real annotation errors. With +current state-of-the-art (SOTA) models at the upper end of the label +convergence interval for the well-studied LVIS dataset, we conclude that model +capacity is sufficient to solve current object detection problems. Therefore, +future efforts should focus on three key aspects: (1) updating the problem +specification and adjusting evaluation practices to account for unavoidable +label noise, (2) creating cleaner data, especially test data, and (3) including +multi-annotated data to investigate annotation variation and make these issues +visible from the outset. + +
+
+ comment: Accepted at WACV 2025, added reference to paper associated code +
+
+
+
+
+ + ♻ ☆ LatentBKI: Open-Dictionary Continuous Mapping in Visual-Language Latent + Spaces with Quantifiable Uncertainty + + +
+ This paper introduces a novel probabilistic mapping algorithm, LatentBKI, +which enables open-vocabulary mapping with quantifiable uncertainty. +Traditionally, semantic mapping algorithms focus on a fixed set of semantic +categories which limits their applicability for complex robotic tasks. +Vision-Language (VL) models have recently emerged as a technique to jointly +model language and visual features in a latent space, enabling semantic +recognition beyond a predefined, fixed set of semantic classes. LatentBKI +recurrently incorporates neural embeddings from VL models into a voxel map with +quantifiable uncertainty, leveraging the spatial correlations of nearby +observations through Bayesian Kernel Inference (BKI). LatentBKI is evaluated +against similar explicit semantic mapping and VL mapping frameworks on the +popular Matterport3D and Semantic KITTI datasets, demonstrating that LatentBKI +maintains the probabilistic benefits of continuous mapping with the additional +benefit of open-dictionary queries. Real-world experiments demonstrate +applicability to challenging indoor environments. + +
+
+
+
+
+ + ♻ ☆ Spatial Cognition from Egocentric Video: Out of Sight, Not Out of Mind 3DV 2025 + + +
+ As humans move around, performing their daily tasks, they are able to recall +where they have positioned objects in their environment, even if these objects +are currently out of their sight. In this paper, we aim to mimic this spatial +cognition ability. We thus formulate the task of Out of Sight, Not Out of Mind +- 3D tracking active objects using observations captured through an egocentric +camera. We introduce a simple but effective approach to address this +challenging problem, called Lift, Match, and Keep (LMK). LMK lifts partial 2D +observations to 3D world coordinates, matches them over time using visual +appearance, 3D location and interactions to form object tracks, and keeps these +object tracks even when they go out-of-view of the camera. We benchmark LMK on +100 long videos from EPIC-KITCHENS. Our results demonstrate that spatial +cognition is critical for correctly locating objects over short and long time +scales. E.g., for one long egocentric video, we estimate the 3D location of 50 +active objects. After 120 seconds, 57% of the objects are correctly localised +by LMK, compared to just 33% by a recent 3D method for egocentric videos and +17% by a general 2D tracking method. + +
+
+ comment: Accepted at 3DV 2025. 14 pages including references and appendix. + Project Webpage: http://dimadamen.github.io/OSNOM/ +
+
+
+
+
+ + ♻ ☆ Deep Geometric Moments Promote Shape Consistency in Text-to-3D + Generation + + +
+ To address the data scarcity associated with 3D assets, 2D-lifting techniques +such as Score Distillation Sampling (SDS) have become a widely adopted practice +in text-to-3D generation pipelines. However, the diffusion models used in these +techniques are prone to viewpoint bias and thus lead to geometric +inconsistencies such as the Janus problem. To counter this, we introduce MT3D, +a text-to-3D generative model that leverages a high-fidelity 3D object to +overcome viewpoint bias and explicitly infuse geometric understanding into the +generation pipeline. Firstly, we employ depth maps derived from a high-quality +3D model as control signals to guarantee that the generated 2D images preserve +the fundamental shape and structure, thereby reducing the inherent viewpoint +bias. Next, we utilize deep geometric moments to ensure geometric consistency +in the 3D representation explicitly. By incorporating geometric details from a +3D asset, MT3D enables the creation of diverse and geometrically consistent +objects, thereby improving the quality and usability of our 3D representations. +Project page and code: https://moment-3d.github.io/ + +
+
+ comment: This paper has been accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ AdjointDEIS: Efficient Gradients for Diffusion Models NeurIPS 2024 + + +
+ The optimization of the latents and parameters of diffusion models with
respect to some differentiable metric defined on the output of the model is a
challenging and complex problem. The sampling for diffusion models is done by
solving either the probability flow ODE or diffusion SDE wherein a neural
network approximates the score function allowing a numerical ODE/SDE solver to
be used. However, naive backpropagation techniques are memory intensive,
requiring the storage of all intermediate states, and face additional
complexity in handling the injected noise from the diffusion term of the
diffusion SDE. We propose a novel family of bespoke ODE solvers to the
continuous adjoint equations for diffusion models, which we call AdjointDEIS.
We exploit the unique construction of diffusion SDEs to further simplify the
formulation of the continuous adjoint equations using exponential integrators.
Moreover, we provide convergence order guarantees for our bespoke solvers.
Significantly, we show that continuous adjoint equations for diffusion SDEs
actually simplify to a simple ODE. Lastly, we demonstrate the effectiveness of
AdjointDEIS for guided generation with an adversarial attack in the form of the
face morphing problem. Our code will be released at
https://github.com/zblasingame/AdjointDEIS.
+
+ comment: NeurIPS 2024 conference paper +
+
+
+
+
+ + ♻ ☆ Application-driven Validation of Posteriors in Inverse Problems + + +
+ Current deep learning-based solutions for image analysis tasks are commonly +incapable of handling problems to which multiple different plausible solutions +exist. In response, posterior-based methods such as conditional Diffusion +Models and Invertible Neural Networks have emerged; however, their translation +is hampered by a lack of research on adequate validation. In other words, the +way progress is measured often does not reflect the needs of the driving +practical application. Closing this gap in the literature, we present the first +systematic framework for the application-driven validation of posterior-based +methods in inverse problems. As a methodological novelty, it adopts key +principles from the field of object detection validation, which has a long +history of addressing the question of how to locate and match multiple object +instances in an image. Treating modes as instances enables us to perform +mode-centric validation, using well-interpretable metrics from the application +perspective. We demonstrate the value of our framework through instantiations +for a synthetic toy example and two medical vision use cases: pose estimation +in surgery and imaging-based quantification of functional tissue parameters for +diagnostics. Our framework offers key advantages over common approaches to +posterior validation in all three examples and could thus revolutionize +performance assessment in inverse problems. + +
+
+
    comment: Accepted at Medical Image Analysis. Shared first authors: Tim J.
  Adler and Jan-Hinrich Nölke. 24 pages, 9 figures, 1 table
+
+
+
+
+ + ♻ ☆ GL-MCM: Global and Local Maximum Concept Matching for Zero-Shot + Out-of-Distribution Detection + + +
+ Zero-shot out-of-distribution (OOD) detection is a task that detects OOD +images during inference with only in-distribution (ID) class names. Existing +methods assume ID images contain a single, centered object, and do not consider +the more realistic multi-object scenarios, where both ID and OOD objects are +present. To meet the needs of many users, the detection method must have the +flexibility to adapt the type of ID images. To this end, we present +Global-Local Maximum Concept Matching (GL-MCM), which incorporates local image +scores as an auxiliary score to enhance the separability of global and local +visual features. Due to the simple ensemble score function design, GL-MCM can +control the type of ID images with a single weight parameter. Experiments on +ImageNet and multi-object benchmarks demonstrate that GL-MCM outperforms +baseline zero-shot methods and is comparable to fully supervised methods. +Furthermore, GL-MCM offers strong flexibility in adjusting the target type of +ID images. The code is available via https://github.com/AtsuMiyai/GL-MCM. + +
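The global-local ensemble score can be sketched as the sum of a global maximum-concept-matching score and a weighted local one, controlled by a single weight parameter; the softmax-scaled form and temperature below are assumptions about the exact score, not the paper's code.

import numpy as np

def gl_mcm_score(global_feat, local_feats, text_feats, lam=1.0, temperature=0.01):
    """global_feat: (D,), local_feats: (P, D) patch features, text_feats: (K, D)
    ID class-name embeddings, all L2-normalized. Higher score => more likely ID."""
    def mcm(feat):                                        # maximum concept matching
        sims = text_feats @ feat                          # cosine similarities to ID class names
        probs = np.exp(sims / temperature)
        return (probs / probs.sum()).max()
    local_score = max(mcm(f) for f in local_feats)        # best-matching local region
    return mcm(global_feat) + lam * local_score

def l2n(x):
    return x / np.linalg.norm(x, axis=-1, keepdims=True)

score = gl_mcm_score(l2n(np.random.randn(512)),
                     l2n(np.random.randn(49, 512)),
                     l2n(np.random.randn(10, 512)))
print(score)   # threshold this value to separate ID from OOD inputs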
+
+ comment: Accepted by International Journal of Computer Vision (IJCV) 2025 +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 
</html>
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..69f5da7
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`