From b965661a25edc70e1d7aa320632a6349879983d5 Mon Sep 17 00:00:00 2001 From: VinPPP Date: Mon, 20 Jan 2025 05:28:14 +0000 Subject: [PATCH] deploy: 7b7f942af1ae56f816bee4f5e57e1f0c5f193e87 --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 28678 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 29073 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..04b961b --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2025-01-13T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.07566v1","updated":"2025-01-13T18:54:02Z","published":"2025-01-13T18:54:02Z","title":"SafeSwarm: Decentralized Safe RL for the Swarm of Drones Landing in\n Dense Crowds","summary":" This paper introduces a safe swarm of drones capable of performing landings\nin crowded environments robustly by relying on Reinforcement Learning\ntechniques combined with Safe Learning. The developed system allows us to teach\nthe swarm of drones with different dynamics to land on moving landing pads in\nan environment while avoiding collisions with obstacles and between agents.\n The safe barrier net algorithm was developed and evaluated using a swarm of\nCrazyflie 2.1 micro quadrotors, which were tested indoors with the Vicon motion\ncapture system to ensure precise localization and control.\n Experimental results show that our system achieves landing accuracy of 2.25\ncm with a mean time of 17 s and collision-free landings, underscoring its\neffectiveness and robustness in real-world scenarios. This work offers a\npromising foundation for applications in environments where safety and\nprecision are paramount.\n","authors":["Grik Tadevosyan","Maksim Osipenko","Demetros Aschu","Aleksey Fedoseev","Valerii Serpiva","Oleg Sautenkov","Sausar Karaf","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04987v2","updated":"2025-01-13T18:24:22Z","published":"2024-11-07T18:55:10Z","title":"Few-Shot Task Learning through Inverse Generative Modeling","summary":" Learning the intents of an agent, defined by its goals or motion style, is\noften extremely challenging from just a few examples. We refer to this problem\nas task concept learning and present our approach, Few-Shot Task Learning\nthrough Inverse Generative Modeling (FTL-IGM), which learns new task concepts\nby leveraging invertible neural generative models. The core idea is to pretrain\na generative model on a set of basic concepts and their demonstrations. Then,\ngiven a few demonstrations of a new concept (such as a new goal or a new\naction), our method learns the underlying concepts through backpropagation\nwithout updating the model weights, thanks to the invertibility of the\ngenerative model. We evaluate our method in five domains -- object\nrearrangement, goal-oriented navigation, motion caption of human actions,\nautonomous driving, and real-world table-top manipulation. 
Our experimental\nresults demonstrate that via the pretrained generative model, we successfully\nlearn novel concepts and generate agent plans or motion corresponding to these\nconcepts in (1) unseen environments and (2) in composition with training\nconcepts.\n","authors":["Aviv Netanyahu","Yilun Du","Antonia Bronars","Jyothish Pari","Joshua Tenenbaum","Tianmin Shu","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.04987v2.pdf","comment":"Added acknowledgment"},{"id":"http://arxiv.org/abs/2501.07507v1","updated":"2025-01-13T17:25:46Z","published":"2025-01-13T17:25:46Z","title":"Inductive Learning of Robot Task Knowledge from Raw Data and Online\n Expert Feedback","summary":" The increasing level of autonomy of robots poses challenges of trust and\nsocial acceptance, especially in human-robot interaction scenarios. This\nrequires an interpretable implementation of robotic cognitive capabilities,\npossibly based on formal methods as logics for the definition of task\nspecifications. However, prior knowledge is often unavailable in complex\nrealistic scenarios.\n In this paper, we propose an offline algorithm based on inductive logic\nprogramming from noisy examples to extract task specifications (i.e., action\npreconditions, constraints and effects) directly from raw data of few\nheterogeneous (i.e., not repetitive) robotic executions. Our algorithm\nleverages on the output of any unsupervised action identification algorithm\nfrom video-kinematic recordings. Combining it with the definition of very\nbasic, almost task-agnostic, commonsense concepts about the environment, which\ncontribute to the interpretability of our methodology, we are able to learn\nlogical axioms encoding preconditions of actions, as well as their effects in\nthe event calculus paradigm. Since the quality of learned specifications\ndepends mainly on the accuracy of the action identification algorithm, we also\npropose an online framework for incremental refinement of task knowledge from\nuser feedback, guaranteeing safe execution. Results in a standard manipulation\ntask and benchmark for user training in the safety-critical surgical robotic\nscenario, show the robustness, data- and time-efficiency of our methodology,\nwith promising results towards the scalability in more complex domains.\n","authors":["Daniele Meli","Paolo Fiorini"],"pdf_url":"https://arxiv.org/pdf/2501.07507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07462v1","updated":"2025-01-13T16:32:13Z","published":"2025-01-13T16:32:13Z","title":"The Sense of Agency in Assistive Robotics Using Shared Autonomy","summary":" Sense of agency is one factor that influences people's preferences for robot\nassistance and a phenomenon from cognitive science that represents the\nexperience of control over one's environment. However, in assistive robotics\nliterature, we often see paradigms that optimize measures like task success and\ncognitive load, rather than sense of agency. In fact, prior work has found that\nparticipants sometimes express a preference for paradigms, such as direct\nteleoperation, which do not perform well with those other metrics but give more\ncontrol to the user. In this work, we focus on a subset of assistance paradigms\nfor manipulation called shared autonomy in which the system combines control\nsignals from the user and the automated control. 
We run a study to evaluate\nsense of agency and show that higher robot autonomy during assistance leads to\nimproved task performance but a decreased sense of agency, indicating a\npotential trade-off between task performance and sense of agency. From our\nfindings, we discuss the relation between sense of agency and optimality, and\nwe consider a proxy metric for a component of sense of agency which might\nenable us to build systems that monitor and maintain sense of agency in real\ntime.\n","authors":["Maggie A. Collier","Rithika Narayan","Henny Admoni"],"pdf_url":"https://arxiv.org/pdf/2501.07462v1.pdf","comment":"10 pages, 8 figure, HRI conference"},{"id":"http://arxiv.org/abs/2501.07421v1","updated":"2025-01-13T15:41:18Z","published":"2025-01-13T15:41:18Z","title":"Empirical Comparison of Four Stereoscopic Depth Sensing Cameras for\n Robotics Applications","summary":" Depth sensing is an essential technology in robotics and many other fields.\nMany depth sensing (or RGB-D) cameras are available on the market and selecting\nthe best one for your application can be challenging. In this work, we tested\nfour stereoscopic RGB-D cameras that sense the distance by using two images\nfrom slightly different views. We empirically compared four cameras (Intel\nRealSense D435, Intel RealSense D455, StereoLabs ZED 2, and Luxonis OAK-D Pro)\nin three scenarios: (i) planar surface perception, (ii) plastic doll\nperception, (iii) household object perception (YCB dataset). We recorded and\nevaluated more than 3,000 RGB-D frames for each camera. For table-top robotics\nscenarios with distance to objects up to one meter, the best performance is\nprovided by the D435 camera. For longer distances, the other three models\nperform better, making them more suitable for some mobile robotics\napplications. OAK-D Pro additionally offers integrated AI modules (e.g., object\nand human keypoint detection). ZED 2 is not a standalone device and requires a\ncomputer with a GPU for depth data acquisition. All data (more than 12,000\nRGB-D frames) are publicly available at https://osf.io/f2seb.\n","authors":["Lukas Rustler","Vojtech Volprecht","Matej Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2501.07421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07399v1","updated":"2025-01-13T15:17:10Z","published":"2025-01-13T15:17:10Z","title":"Efficiently Closing Loops in LiDAR-Based SLAM Using Point Cloud Density\n Maps","summary":" Consistent maps are key for most autonomous mobile robots. They often use\nSLAM approaches to build such maps. Loop closures via place recognition help\nmaintain accurate pose estimates by mitigating global drift. This paper\npresents a robust loop closure detection pipeline for outdoor SLAM with\nLiDAR-equipped robots. The method handles various LiDAR sensors with different\nscanning patterns, field of views and resolutions. It generates local maps from\nLiDAR scans and aligns them using a ground alignment module to handle both\nplanar and non-planar motion of the LiDAR, ensuring applicability across\nplatforms. The method uses density-preserving bird's eye view projections of\nthese local maps and extracts ORB feature descriptors from them for place\nrecognition. It stores the feature descriptors in a binary search tree for\nefficient retrieval, and self-similarity pruning addresses perceptual aliasing\nin repetitive environments. 
Extensive experiments on public and self-recorded\ndatasets demonstrate accurate loop closure detection, long-term localization,\nand cross-platform multi-map alignment, agnostic to the LiDAR scanning\npatterns, fields of view, and motion profiles.\n","authors":["Saurabh Gupta","Tiziano Guadagnino","Benedikt Mersch","Niklas Trekel","Meher V. R. Malladi","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2501.07399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08094v2","updated":"2025-01-13T14:53:11Z","published":"2023-05-14T08:10:49Z","title":"Accelerating genetic optimization of nonlinear model predictive control\n by learning optimal search space size","summary":" Genetic algorithm (GA) is typically used to solve nonlinear model predictive\ncontrol's optimization problem. However, the size of the search space in which\nthe GA searches for the optimal control inputs is crucial for its applicability\nto fast-response systems. This paper proposes accelerating the genetic\noptimization of NMPC by learning optimal search space size. The approach trains\na multivariate regression model to adaptively predict the best smallest size of\nthe search space in every control cycle. The proposed approach reduces the GA's\ncomputational time, improves the chance of convergence to better control\ninputs, and provides a stable and feasible solution. The proposed approach was\nevaluated on three nonlinear systems and compared to four other evolutionary\nalgorithms implemented in a processor-in-the-loop fashion. The results show\nthat the proposed approach provides a 17-45\\% reduction in computational time\nand increases the convergence rate by 35-47\\%. The source code is available on\nGitHub.\n","authors":["Eslam Mostafa","Hussein A. Aly","Ahmed Elliethy"],"pdf_url":"https://arxiv.org/pdf/2305.08094v2.pdf","comment":"Accepted by the Journal of Control and Decision"},{"id":"http://arxiv.org/abs/2412.19706v3","updated":"2025-01-13T14:15:59Z","published":"2024-12-27T16:00:24Z","title":"Geometric Freeze-Tag Problem","summary":" We study the Freeze-Tag Problem (FTP), introduced by Arkin et al. (SODA'02),\nwhere the objective is to activate a group of n robots, starting from a single\ninitially active robot. Robots are positioned in $\\mathbb{R}^d$, and once\nactivated, they move at a constant speed to wake up others. The goal is to\nminimize the time required to activate the last robot, known as the makespan.\nWe establish new upper bounds for the makespan under the $l_1$ and $l_2$ norms\nin $\\mathbb{R}^2$ and $\\mathbb{R}^3$. Specifically, we improve the previous\nupper bound for $(\\mathbb{R}^2, l_2)$ from $7.07r$ (Bonichon et al., DISC'24)\nto $5.064r$. For $(\\mathbb{R}^3, l_1)$, we derive a makespan bound of $13r$,\nwhich translates to $22.52r$ for $(\\mathbb{R}^3, l_2)$. Here, $r$ denotes the\nmaximum distance of any robot from the initially active robot under the given\nnorm. To our knowledge, these are the first makespan bounds for FTP in\n$\\mathbb{R}^3$. Additionally, we show that the maximum makespan for $n$ robots\nis not necessarily achieved when robots are equally distributed along the\nboundary in $(\\mathbb{R}^2, l_2)$. 
We further investigate FTP in\n$(\\mathbb{R}^3, l_2)$ for specific configurations where robots lie on a\nboundary, providing insights into practical scenarios.\n","authors":["Sharareh Alipour","Kajal Baghestani","Mahdis Mirzaei","Soroush Sahraei"],"pdf_url":"https://arxiv.org/pdf/2412.19706v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06782v2","updated":"2025-01-13T14:11:49Z","published":"2024-11-11T08:19:54Z","title":"QuadWBG: Generalizable Quadrupedal Whole-Body Grasping","summary":" Legged robots with advanced manipulation capabilities have the potential to\nsignificantly improve household duties and urban maintenance. Despite\nconsiderable progress in developing robust locomotion and precise manipulation\nmethods, seamlessly integrating these into cohesive whole-body control for\nreal-world applications remains challenging. In this paper, we present a\nmodular framework for robust and generalizable whole-body loco-manipulation\ncontroller based on a single arm-mounted camera. By using reinforcement\nlearning (RL), we enable a robust low-level policy for command execution over 5\ndimensions (5D) and a grasp-aware high-level policy guided by a novel metric,\nGeneralized Oriented Reachability Map (GORM). The proposed system achieves\nstate-of-the-art one-time grasping accuracy of 89% in the real world, including\nchallenging tasks such as grasping transparent objects. Through extensive\nsimulations and real-world experiments, we demonstrate that our system can\neffectively manage a large workspace, from floor level to above body height,\nand perform diverse whole-body loco-manipulation tasks.\n","authors":["Jilong Wang","Javokhirbek Rajabov","Chaoyi Xu","Yiming Zheng","He Wang"],"pdf_url":"https://arxiv.org/pdf/2411.06782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07343v1","updated":"2025-01-13T13:57:37Z","published":"2025-01-13T13:57:37Z","title":"Fast-Revisit Coverage Path Planning for Autonomous Mobile Patrol Robots\n Using Long-Range Sensor Information","summary":" The utilization of Unmanned Ground Vehicles (UGVs) for patrolling industrial\nsites has expanded significantly. These UGVs typically are equipped with\nperception systems, e.g., computer vision, with limited range due to sensor\nlimitations or site topology. High-level control of the UGVs requires Coverage\nPath Planning (CPP) algorithms that navigate all relevant waypoints and\npromptly start the next cycle. In this paper, we propose the novel Fast-Revisit\nCoverage Path Planning (FaRe-CPP) algorithm using a greedy heuristic approach\nto propose waypoints for maximum coverage area and a random search-based path\noptimization technique to obtain a path along the proposed waypoints with\nminimum revisit time. We evaluated the algorithm in a simulated environment\nusing Gazebo and a camera-equipped TurtleBot3 against a number of existing\nalgorithms. 
Compared to their average revisit times and path lengths, our\nFaRe-CPP algorithm approximately showed a 45% and 40% reduction, respectively,\nin these highly relevant performance indicators.\n","authors":["Srinivas Kachavarapu","Tobias Doernbach","Reinhard Gerndt"],"pdf_url":"https://arxiv.org/pdf/2501.07343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07317v1","updated":"2025-01-13T13:28:03Z","published":"2025-01-13T13:28:03Z","title":"Evaluation of Artificial Intelligence Methods for Lead Time Prediction\n in Non-Cycled Areas of Automotive Production","summary":" The present study examines the effectiveness of applying Artificial\nIntelligence methods in an automotive production environment to predict unknown\nlead times in a non-cycle-controlled production area. Data structures are\nanalyzed to identify contextual features and then preprocessed using one-hot\nencoding. Methods selection focuses on supervised machine learning techniques.\nIn supervised learning methods, regression and classification methods are\nevaluated. Continuous regression based on target size distribution is not\nfeasible. Classification methods analysis shows that Ensemble Learning and\nSupport Vector Machines are the most suitable. Preliminary study results\nindicate that gradient boosting algorithms LightGBM, XGBoost, and CatBoost\nyield the best results. After further testing and extensive hyperparameter\noptimization, the final method choice is the LightGBM algorithm. Depending on\nfeature availability and prediction interval granularity, relative prediction\naccuracies of up to 90% can be achieved. Further tests highlight the importance\nof periodic retraining of AI models to accurately represent complex production\nprocesses using the database. The research demonstrates that AI methods can be\neffectively applied to highly variable production data, adding business value\nby providing an additional metric for various control tasks while outperforming\ncurrent non AI-based systems.\n","authors":["Cornelius Hake","Jonas Weigele","Frederik Reichert","Christian Friedrich"],"pdf_url":"https://arxiv.org/pdf/2501.07317v1.pdf","comment":"7 pages, 4 figures, CLC2024 Conference"},{"id":"http://arxiv.org/abs/2501.07299v1","updated":"2025-01-13T13:07:20Z","published":"2025-01-13T13:07:20Z","title":"ViewVR: Visual Feedback Modes to Achieve Quality of VR-based\n Telemanipulation","summary":" The paper focuses on an immersive teleoperation system that enhances\noperator's ability to actively perceive the robot's surroundings. A\nconsumer-grade HTC Vive VR system was used to synchronize the operator's hand\nand head movements with a UR3 robot and a custom-built robotic head with two\ndegrees of freedom (2-DoF). The system's usability, manipulation efficiency,\nand intuitiveness of control were evaluated in comparison with static head\ncamera positioning across three distinct tasks. Code and other supplementary\nmaterials can be accessed by link: https://github.com/ErkhovArtem/ViewVR\n","authors":["A. Erkhov","A. Bazhenov","S. Satsevich","D. Belov","F. Khabibullin","S. Egorov","M. Gromakov","M. Altamirano Cabrera","D. 
Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07295v1","updated":"2025-01-13T13:01:21Z","published":"2025-01-13T13:01:21Z","title":"GestLLM: Advanced Hand Gesture Interpretation via Large Language Models\n for Human-Robot Interaction","summary":" This paper introduces GestLLM, an advanced system for human-robot interaction\nthat enables intuitive robot control through hand gestures. Unlike conventional\nsystems, which rely on a limited set of predefined gestures, GestLLM leverages\nlarge language models and feature extraction via MediaPipe to interpret a\ndiverse range of gestures. This integration addresses key limitations in\nexisting systems, such as restricted gesture flexibility and the inability to\nrecognize complex or unconventional gestures commonly used in human\ncommunication.\n By combining state-of-the-art feature extraction and language model\ncapabilities, GestLLM achieves performance comparable to leading\nvision-language models while supporting gestures underrepresented in\ntraditional datasets. For example, this includes gestures from popular culture,\nsuch as the ``Vulcan salute\" from Star Trek, without any additional\npretraining, prompt engineering, etc. This flexibility enhances the naturalness\nand inclusivity of robot control, making interactions more intuitive and\nuser-friendly.\n GestLLM provides a significant step forward in gesture-based interaction,\nenabling robots to understand and respond to a wide variety of hand gestures\neffectively. This paper outlines its design, implementation, and evaluation,\ndemonstrating its potential applications in advanced human-robot collaboration,\nassistive robotics, and interactive entertainment.\n","authors":["Oleg Kobzarev","Artem Lykov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07259v1","updated":"2025-01-13T12:14:48Z","published":"2025-01-13T12:14:48Z","title":"PO-GVINS: Tightly Coupled GNSS-Visual-Inertial Integration with\n Pose-Only Representation","summary":" Accurate and reliable positioning is crucial for perception, decision-making,\nand other high-level applications in autonomous driving, unmanned aerial\nvehicles, and intelligent robots. Given the inherent limitations of standalone\nsensors, integrating heterogeneous sensors with complementary capabilities is\none of the most effective approaches to achieving this goal. In this paper, we\npropose a filtering-based, tightly coupled global navigation satellite system\n(GNSS)-visual-inertial positioning framework with a pose-only formulation\napplied to the visual-inertial system (VINS), termed PO-GVINS. Specifically,\nmultiple-view imaging used in current VINS requires a priori of 3D feature,\nthen jointly estimate camera poses and 3D feature position, which inevitably\nintroduces linearization error of the feature as well as facing dimensional\nexplosion. However, the pose-only (PO) formulation, which is demonstrated to be\nequivalent to the multiple-view imaging and has been applied in visual\nreconstruction, represent feature depth using two camera poses and thus 3D\nfeature position is removed from state vector avoiding aforementioned\ndifficulties. Inspired by this, we first apply PO formulation in our VINS,\ni.e., PO-VINS. GNSS raw measurements are then incorporated with integer\nambiguity resolved to achieve accurate and drift-free estimation. 
Extensive\nexperiments demonstrate that the proposed PO-VINS significantly outperforms the\nmulti-state constrained Kalman filter (MSCKF). By incorporating GNSS\nmeasurements, PO-GVINS achieves accurate, drift-free state estimation, making\nit a robust solution for positioning in challenging environments.\n","authors":["Zhuo Xu","Feng Zhu","Zihang Zhang","Chang Jian","Jiarui Lv","Yuantai Zhang","Xiaohong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07255v1","updated":"2025-01-13T12:06:58Z","published":"2025-01-13T12:06:58Z","title":"GazeGrasp: DNN-Driven Robotic Grasping with Wearable Eye-Gaze Interface","summary":" We present GazeGrasp, a gaze-based manipulation system enabling individuals\nwith motor impairments to control collaborative robots using eye-gaze. The\nsystem employs an ESP32 CAM for eye tracking, MediaPipe for gaze detection, and\nYOLOv8 for object localization, integrated with a Universal Robot UR10 for\nmanipulation tasks. After user-specific calibration, the system allows\nintuitive object selection with a magnetic snapping effect and robot control\nvia eye gestures. Experimental evaluation involving 13 participants\ndemonstrated that the magnetic snapping effect significantly reduced gaze\nalignment time, improving task efficiency by 31%. GazeGrasp provides a robust,\nhands-free interface for assistive robotics, enhancing accessibility and\nautonomy for users.\n","authors":["Issatay Tokmurziyev","Miguel Altamirano Cabrera","Luis Moreno","Muhammad Haris Khan","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07255v1.pdf","comment":"Accepted to: IEEE/ACM International Conference on Human-Robot\n Interaction (HRI 2025)"},{"id":"http://arxiv.org/abs/2412.20104v2","updated":"2025-01-13T11:46:06Z","published":"2024-12-28T10:12:12Z","title":"SyncDiff: Synchronized Motion Diffusion for Multi-Body Human-Object\n Interaction Synthesis","summary":" Synthesizing realistic human-object interaction motions is a critical problem\nin VR/AR and human animation. Unlike the commonly studied scenarios involving a\nsingle human or hand interacting with one object, we address a more generic\nmulti-body setting with arbitrary numbers of humans, hands, and objects. This\ncomplexity introduces significant challenges in synchronizing motions due to\nthe high correlations and mutual influences among bodies. To address these\nchallenges, we introduce SyncDiff, a novel method for multi-body interaction\nsynthesis using a synchronized motion diffusion strategy. SyncDiff employs a\nsingle diffusion model to capture the joint distribution of multi-body motions.\nTo enhance motion fidelity, we propose a frequency-domain motion decomposition\nscheme. Additionally, we introduce a new set of alignment scores to emphasize\nthe synchronization of different body motions. SyncDiff jointly optimizes both\ndata sample likelihood and alignment likelihood through an explicit\nsynchronization strategy. 
Extensive experiments across four datasets with\nvarious multi-body configurations demonstrate the superiority of SyncDiff over\nexisting state-of-the-art motion synthesis methods.\n","authors":["Wenkun He","Yun Liu","Ruitao Liu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2412.20104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07224v1","updated":"2025-01-13T11:22:57Z","published":"2025-01-13T11:22:57Z","title":"Touched by ChatGPT: Using an LLM to Drive Affective Tactile Interaction","summary":" Touch is a fundamental aspect of emotion-rich communication, playing a vital\nrole in human interaction and offering significant potential in human-robot\ninteraction. Previous research has demonstrated that a sparse representation of\nhuman touch can effectively convey social tactile signals. However, advances in\nhuman-robot tactile interaction remain limited, as many humanoid robots possess\nsimplistic capabilities, such as only opening and closing their hands,\nrestricting nuanced tactile expressions. In this study, we explore how a robot\ncan use sparse representations of tactile vibrations to convey emotions to a\nperson. To achieve this, we developed a wearable sleeve integrated with a 5x5\ngrid of vibration motors, enabling the robot to communicate diverse tactile\nemotions and gestures. Using chain prompts within a Large Language Model (LLM),\nwe generated distinct 10-second vibration patterns corresponding to 10 emotions\n(e.g., happiness, sadness, fear) and 6 touch gestures (e.g., pat, rub, tap).\nParticipants (N = 32) then rated each vibration stimulus based on perceived\nvalence and arousal. People are accurate at recognising intended emotions, a\nresult which aligns with earlier findings. These results highlight the LLM's\nability to generate emotional haptic data and effectively convey emotions\nthrough tactile signals. By translating complex emotional and tactile\nexpressions into vibratory patterns, this research demonstrates how LLMs can\nenhance physical interaction between humans and robots.\n","authors":["Qiaoqiao Ren","Tony Belpaeme"],"pdf_url":"https://arxiv.org/pdf/2501.07224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07223v1","updated":"2025-01-13T11:21:53Z","published":"2025-01-13T11:21:53Z","title":"Improving Incremental Nonlinear Dynamic Inversion Robustness Using\n Robust Control in Aerial Robotics","summary":" Improving robustness to uncertainty and rejection of external disturbances\nrepresents a significant challenge in aerial robotics. Nonlinear controllers\nbased on Incremental Nonlinear Dynamic Inversion (INDI), known for their\nability in estimating disturbances through measured-filtered data, have been\nnotably used in such applications. Typically, these controllers comprise two\ncascaded loops: an inner loop employing nonlinear dynamic inversion and an\nouter loop generating the virtual control inputs via linear controllers. In\nthis paper, a novel methodology is introduced, that combines the advantages of\nINDI with the robustness of linear structured $\\mathcal{H}_\\infty$ controllers.\nA full cascaded architecture is proposed to control the dynamics of a\nmultirotor drone, covering both stabilization and guidance. In particular,\nlow-order $\\mathcal{H}_\\infty$ controllers are designed for the outer loop by\nproperly structuring the problem and solving it through non-smooth\noptimization. 
A comparative analysis is conducted between an existing INDI/PD\napproach and the proposed INDI/$\\mathcal{H}_\\infty$ strategy, showing a notable\nenhancement in the rejection of external disturbances. It is carried out first\nusing MATLAB simulations involving a nonlinear model of a Parrot Bebop\nquadcopter drone, and then experimentally using a customized quadcopter built\nby the ENAC team. The results show an improvement of more than 50\\% in the\nrejection of disturbances such as gusts.\n","authors":["Mohamad Hachem","Clément Roos","Thierry Miquel","Murat Bronz"],"pdf_url":"https://arxiv.org/pdf/2501.07223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07216v1","updated":"2025-01-13T11:14:05Z","published":"2025-01-13T11:14:05Z","title":"Temperature Driven Multi-modal/Single-actuated Soft Finger","summary":" Soft pneumatic fingers are of great research interest. However, their\nsignificant potential is limited as most of them can generate only one motion,\nmostly bending. The conventional design of soft fingers does not allow them to\nswitch to another motion mode. In this paper, we developed a novel multi-modal\nand single-actuated soft finger where its motion mode is switched by changing\nthe finger's temperature. Our soft finger is capable of switching between three\ndistinctive motion modes: bending, twisting, and extension-in approximately\nfive seconds. We carried out a detailed experimental study of the soft finger\nand evaluated its repeatability and range of motion. It exhibited repeatability\nof around one millimeter and a fifty percent larger range of motion than a\nstandard bending actuator. We developed an analytical model for a\nfiber-reinforced soft actuator for twisting motion. This helped us relate the\ninput pressure to the output twist radius of the twisting motion. This model\nwas validated by experimental verification. Further, a soft robotic gripper\nwith multiple grasp modes was developed using three actuators. This gripper can\nadapt to and grasp objects of a large range of size, shape, and stiffness. We\nshowcased its grasping capabilities by successfully grasping a small berry, a\nlarge roll, and a delicate tofu cube.\n","authors":["Prashant Kumar","Weiwei Wan","Kensuke Harada"],"pdf_url":"https://arxiv.org/pdf/2501.07216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07213v1","updated":"2025-01-13T11:12:47Z","published":"2025-01-13T11:12:47Z","title":"Multi-face emotion detection for effective Human-Robot Interaction","summary":" The integration of dialogue interfaces in mobile devices has become\nubiquitous, providing a wide array of services. As technology progresses,\nhumanoid robots designed with human-like features to interact effectively with\npeople are gaining prominence, and the use of advanced human-robot dialogue\ninterfaces is continually expanding. In this context, emotion recognition plays\na crucial role in enhancing human-robot interaction by enabling robots to\nunderstand human intentions. This research proposes a facial emotion detection\ninterface integrated into a mobile humanoid robot, capable of displaying\nreal-time emotions from multiple individuals on a user interface. To this end,\nvarious deep neural network models for facial expression recognition were\ndeveloped and evaluated under consistent computer-based conditions, yielding\npromising results. 
Afterwards, a trade-off between accuracy and memory\nfootprint was carefully considered to effectively implement this application on\na mobile humanoid robot.\n","authors":["Mohamed Ala Yahyaoui","Mouaad Oujabour","Leila Ben Letaifa","Amine Bohi"],"pdf_url":"https://arxiv.org/pdf/2501.07213v1.pdf","comment":"9 pages, 8 figures and 1 table. Accepted at the 17th International\n Conference on Agents and Artificial Intelligence (ICAART 2025), Porto,\n Portugal"},{"id":"http://arxiv.org/abs/2501.07180v1","updated":"2025-01-13T10:19:30Z","published":"2025-01-13T10:19:30Z","title":"Evaluating Robotic Approach Techniques for the Insertion of a Straight\n Instrument into a Vitreoretinal Surgery Trocar","summary":" Advances in vitreoretinal robotic surgery enable precise techniques for gene\ntherapies. This study evaluates three robotic approaches using the 7-DoF\nrobotic arm for docking a micro-precise tool to a trocar: fully co-manipulated,\nhybrid co-manipulated/teleoperated, and hybrid with camera assistance. The\nfully co-manipulated approach was the fastest but had a 42% success rate.\nHybrid methods showed higher success rates (91.6% and 100%) and completed tasks\nwithin 2 minutes. NASA Task Load Index (TLX) assessments indicated lower\nphysical demand and effort for hybrid approaches.\n","authors":["Ross Henry","Martin Huber","Anestis Mablekos-Alexiou","Carlo Seneci","Mohamed Abdelaziz","Hans Natalius","Lyndon da Cruz","Christos Bergeles"],"pdf_url":"https://arxiv.org/pdf/2501.07180v1.pdf","comment":"2 Pages, 2 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2409.06501v3","updated":"2025-01-13T09:53:48Z","published":"2024-09-10T13:34:53Z","title":"An Adaptive Sliding Window Estimator for Positioning of Unmanned Aerial\n Vehicle Using a Single Anchor","summary":" Localization using a single range anchor combined with onboard\noptical-inertial odometry offers a lightweight solution that provides\nmultidimensional measurements for the positioning of unmanned aerial vehicles.\nUnfortunately, the performance of such lightweight sensors varies with the\ndynamic environment, and the fidelity of the dynamic model is also severely\naffected by environmental aerial flow. To address this challenge, we propose an\nadaptive sliding window estimator equipped with an estimation reliability\nevaluator, where the states, noise covariance matrices and aerial drag are\nestimated simultaneously. The aerial drag effects are first evaluated based on\nposterior states and covariance. Then, an augmented Kalman filter is designed\nto pre-process multidimensional measurements and inherit historical\ninformation. Subsequently, an inverse-Wishart smoother is employed to estimate\nposterior states and covariance matrices. To further suppress potential\ndivergence, a reliability evaluator is devised to infer estimation errors. We\nfurther determine the fidelity of each sensor based on the error propagation.\nExtensive experiments are conducted in both standard and harsh environments,\ndemonstrating the adaptability and robustness of the proposed method. 
The root\nmean square error reaches 0.15 m, outperforming the state-of-the-art approach.\n","authors":["Kaiwen Xiong","Sijia Chen","Wei Dong"],"pdf_url":"https://arxiv.org/pdf/2409.06501v3.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2407.11218v3","updated":"2025-01-13T09:23:41Z","published":"2024-07-15T20:07:33Z","title":"Walk along: An Experiment on Controlling the Mobile Robot 'Spot' with\n Voice and Gestures","summary":" Robots are becoming more capable and can autonomously perform tasks such as\nnavigating between locations. However, human oversight remains crucial. This\nstudy compared two touchless methods for directing mobile robots: voice control\nand gesture control, to investigate the efficiency of the methods and the\npreference of users. We tested these methods in two conditions: one in which\nparticipants remained stationary and one in which they walked freely alongside\nthe robot. We hypothesized that walking alongside the robot would result in\nhigher intuitiveness ratings and improved task performance, based on the idea\nthat walking promotes spatial alignment and reduces the effort required for\nmental rotation. In a 2x2 within-subject design, 218 participants guided the\nquadruped robot Spot along a circuitous route with multiple 90-degree turns\nusing rotate left, rotate right, and walk forward commands. After each trial,\nparticipants rated the intuitiveness of the command mapping, while\npost-experiment interviews were used to gather the participants' preferences.\nResults showed that voice control combined with walking with Spot was the most\nfavored and intuitive, whereas gesture control while standing caused confusion\nfor left/right commands. Nevertheless, 29% of participants preferred gesture\ncontrol, citing increased task engagement and visual congruence as reasons. An\nodometry-based analysis revealed that participants often followed behind Spot,\nparticularly in the gesture control condition, when they were allowed to walk.\nIn conclusion, voice control with walking produced the best outcomes. Improving\nphysical ergonomics and adjusting gesture types could make gesture control more\neffective.\n","authors":["Renchi Zhang","Jesse van der Linden","Dimitra Dodou","Harleigh Seyffert","Yke Bauke Eisma","Joost C. F. de Winter"],"pdf_url":"https://arxiv.org/pdf/2407.11218v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01144v3","updated":"2025-01-13T08:40:27Z","published":"2024-09-02T10:28:18Z","title":"Adaptive Non-linear Centroidal MPC with Stability Guarantees for Robust\n Locomotion of Legged Robots","summary":" Nonlinear model predictive locomotion controllers based on the reduced\ncentroidal dynamics are nowadays ubiquitous in legged robots. These schemes,\neven if they assume an inherent simplification of the robot's dynamics, were\nshown to endow robots with a step-adjustment capability in reaction to small\npushes, and, moreover, in the case of uncertain parameters - as unknown\npayloads - they were shown to be able to provide some practical, albeit\nlimited, robustness. In this work, we provide rigorous certificates of their\nclosed loop stability via a reformulation of the centroidal MPC controller.\nThis is achieved thanks to a systematic procedure inspired by the machinery of\nadaptive control, together with ideas coming from Control Lyapunov functions.\nOur reformulation, in addition, provides robustness for a class of unmeasured\nconstant disturbances. 
To demonstrate the generality of our approach, we\nvalidated our formulation on a new generation of humanoid robots - the 56.7 kg\nergoCub, as well as on a commercially available 21 kg quadruped robot, Aliengo.\n","authors":["Mohamed Elobaid","Giulio Turrisi","Lorenzo Rapetti","Giulio Romualdi","Stefano Dafarra","Tomohiro Kawakami","Tomohiro Chaki","Takahide Yoshiike","Claudio Semini","Daniele Pucci"],"pdf_url":"https://arxiv.org/pdf/2409.01144v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14797v2","updated":"2025-01-13T07:47:32Z","published":"2024-07-20T07:56:24Z","title":"From Underground Mines to Offices: A Versatile and Robust Framework for\n Range-Inertial SLAM","summary":" Simultaneous Localization and Mapping (SLAM) is an essential component of\nautonomous robotic applications and self-driving vehicles, enabling them to\nunderstand and operate in their environment. Many SLAM systems have been\nproposed in the last decade, but they are often complex to adapt to different\nsettings or sensor setups. In this work, we present LiDAR Graph-SLAM (LG-SLAM),\na versatile range-inertial SLAM framework that can be adapted to different\ntypes of sensors and environments, from underground mines to offices with\nminimal parameter tuning. Our system integrates range, inertial and GNSS\nmeasurements into a graph-based optimization framework. We also use a refined\nsubmap management approach and a robust loop closure method that effectively\naccounts for uncertainty in the identification and validation of putative loop\nclosures, ensuring global consistency and robustness. Enabled by a parallelized\narchitecture and GPU integration, our system achieves pose estimation at LiDAR\nframe rate, along with online loop closing and graph optimization. We validate\nour system in diverse environments using public datasets and real-world data,\nconsistently achieving an average error below 20 cm and outperforming other\nstate-of-the-art algorithms.\n","authors":["Lorenzo Montano-Oliván","Julio A. Placed","Luis Montano","María T. Lázaro"],"pdf_url":"https://arxiv.org/pdf/2407.14797v2.pdf","comment":"8 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.10031v2","updated":"2025-01-13T06:03:14Z","published":"2024-07-14T00:12:44Z","title":"LLaMAR: Long-Horizon Planning for Multi-Agent Robots in Partially\n Observable Environments","summary":" The ability of Language Models (LMs) to understand natural language makes\nthem a powerful tool for parsing human instructions into task plans for\nautonomous robots. Unlike traditional planning methods that rely on\ndomain-specific knowledge and handcrafted rules, LMs generalize from diverse\ndata and adapt to various tasks with minimal tuning, acting as a compressed\nknowledge base. However, LMs in their standard form face challenges with\nlong-horizon tasks, particularly in partially observable multi-agent settings.\nWe propose an LM-based Long-Horizon Planner for Multi-Agent Robotics (LLaMAR),\na cognitive architecture for planning that achieves state-of-the-art results in\nlong-horizon tasks within partially observable environments. LLaMAR employs a\nplan-act-correct-verify framework, allowing self-correction from action\nexecution feedback without relying on oracles or simulators. Additionally, we\npresent MAP-THOR, a comprehensive test suite encompassing household tasks of\nvarying complexity within the AI2-THOR environment. 
Experiments show that\nLLaMAR achieves a 30% higher success rate than other state-of-the-art LM-based\nmulti-agent planners in MAP-THOR and Search \\& Rescue tasks. Code can be found\nat https://github.com/nsidn98/LLaMAR\n","authors":["Siddharth Nayak","Adelmo Morrison Orozco","Marina Ten Have","Vittal Thirumalai","Jackson Zhang","Darren Chen","Aditya Kapoor","Eric Robinson","Karthik Gopalakrishnan","James Harrison","Brian Ichter","Anuj Mahajan","Hamsa Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2407.10031v2.pdf","comment":"27 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.07051v1","updated":"2025-01-13T04:18:52Z","published":"2025-01-13T04:18:52Z","title":"ROSAnnotator: A Web Application for ROSBag Data Analysis in Human-Robot\n Interaction","summary":" Human-robot interaction (HRI) is an interdisciplinary field that utilises\nboth quantitative and qualitative methods. While ROSBags, a file format within\nthe Robot Operating System (ROS), offer an efficient means of collecting\ntemporally synched multimodal data in empirical studies with real robots, there\nis a lack of tools specifically designed to integrate qualitative coding and\nanalysis functions with ROSBags. To address this gap, we developed\nROSAnnotator, a web-based application that incorporates a multimodal Large\nLanguage Model (LLM) to support both manual and automated annotation of ROSBag\ndata. ROSAnnotator currently facilitates video, audio, and transcription\nannotations and provides an open interface for custom ROS messages and tools.\nBy using ROSAnnotator, researchers can streamline the qualitative analysis\nprocess, create a more cohesive analysis pipeline, and quickly access\nstatistical summaries of annotations, thereby enhancing the overall efficiency\nof HRI data analysis. https://github.com/CHRI-Lab/ROSAnnotator\n","authors":["Yan Zhang","Haoqi Li","Ramtin Tabatabaei","Wafa Johal"],"pdf_url":"https://arxiv.org/pdf/2501.07051v1.pdf","comment":"Accepted to HRI 2025"},{"id":"http://arxiv.org/abs/2412.16908v2","updated":"2025-01-13T04:11:53Z","published":"2024-12-22T07:54:21Z","title":"Map Imagination Like Blind Humans: Group Diffusion Model for Robotic Map\n Generation","summary":" Can robots imagine or generate maps like humans do, especially when only\nlimited information can be perceived like blind people? To address this\nchallenging task, we propose a novel group diffusion model (GDM) based\narchitecture for robots to generate point cloud maps with very limited input\ninformation.Inspired from the blind humans' natural capability of imagining or\ngenerating mental maps, the proposed method can generate maps without visual\nperception data or depth data. 
With additional limited super-sparse spatial\npositioning data, like the extra contact-based positioning information the\nblind individuals can obtain, the map generation quality can be improved even\nmore.Experiments on public datasets are conducted, and the results indicate\nthat our method can generate reasonable maps solely based on path data, and\nproduce even more refined maps upon incorporating exiguous LiDAR data.Compared\nto conventional mapping approaches, our novel method significantly mitigates\nsensor dependency, enabling the robots to imagine and generate elementary maps\nwithout heavy onboard sensory devices.\n","authors":["Qijin Song","Weibang Bai"],"pdf_url":"https://arxiv.org/pdf/2412.16908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05723v2","updated":"2025-01-13T02:58:58Z","published":"2025-01-10T05:43:34Z","title":"Robot Error Awareness Through Human Reactions: Implementation,\n Evaluation, and Recommendations","summary":" Effective error detection is crucial to prevent task disruption and maintain\nuser trust. Traditional methods often rely on task-specific models or user\nreporting, which can be inflexible or slow. Recent research suggests social\nsignals, naturally exhibited by users in response to robot errors, can enable\nmore flexible, timely error detection. However, most studies rely on post hoc\nanalysis, leaving their real-time effectiveness uncertain and lacking\nuser-centric evaluation. In this work, we developed a proactive error detection\nsystem that combines user behavioral signals (facial action units and speech),\nuser feedback, and error context for automatic error detection. In a study (N =\n28), we compared our proactive system to a status quo reactive approach.\nResults show our system 1) reliably and flexibly detects error, 2) detects\nerrors faster than the reactive approach, and 3) is perceived more favorably by\nusers than the reactive one. We discuss recommendations for enabling robot\nerror awareness in future HRI systems.\n","authors":["Maia Stiber","Russell Taylor","Chien-Ming Huang"],"pdf_url":"https://arxiv.org/pdf/2501.05723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07013v1","updated":"2025-01-13T02:15:15Z","published":"2025-01-13T02:15:15Z","title":"Sthymuli: a Static Educational Robot. Leveraging the Thymio II Platform","summary":" The use of robots in education represents a challenge for teachers and a\nfixed vision of what robots can do for students. This paper presents the\ndevelopment of Sthymuli, a static educational robot designed to explore new\nclassroom interactions between robots, students and teachers. We propose the\nuse of the Thymio II educational platform as a base, ensuring a robust\nbenchmark for a fair comparison of the commonly available wheeled robots and\nour exploratory approach with Sthymuli. This paper outlines the constraints and\nrequirements for developing such a robot, the current state of development and\nfuture work.\n","authors":["Manuel Bernal-Lecina","Alejandrina Hernández","Adrien Pannatier","Léa Pereyre","Francesco Mondada"],"pdf_url":"https://arxiv.org/pdf/2501.07013v1.pdf","comment":"Two pages, three figures. 
ICRA40 extended abstract"},{"id":"http://arxiv.org/abs/2501.06994v1","updated":"2025-01-13T01:01:44Z","published":"2025-01-13T01:01:44Z","title":"Motion Tracks: A Unified Representation for Human-Robot Transfer in\n Few-Shot Imitation Learning","summary":" Teaching robots to autonomously complete everyday tasks remains a challenge.\nImitation Learning (IL) is a powerful approach that imbues robots with skills\nvia demonstrations, but is limited by the labor-intensive process of collecting\nteleoperated robot data. Human videos offer a scalable alternative, but it\nremains difficult to directly train IL policies from them due to the lack of\nrobot action labels. To address this, we propose to represent actions as\nshort-horizon 2D trajectories on an image. These actions, or motion tracks,\ncapture the predicted direction of motion for either human hands or robot\nend-effectors. We instantiate an IL policy called Motion Track Policy (MT-pi)\nwhich receives image observations and outputs motion tracks as actions. By\nleveraging this unified, cross-embodiment action space, MT-pi completes tasks\nwith high success given just minutes of human video and limited additional\nrobot demonstrations. At test time, we predict motion tracks from two camera\nviews, recovering 6DoF trajectories via multi-view synthesis. MT-pi achieves an\naverage success rate of 86.5% across 4 real-world tasks, outperforming\nstate-of-the-art IL baselines which do not leverage human data or our action\nspace by 40%, and generalizes to scenarios seen only in human videos. Code and\nvideos are available on our website\nhttps://portal-cornell.github.io/motion_track_policy/.\n","authors":["Juntao Ren","Priya Sundaresan","Dorsa Sadigh","Sanjiban Choudhury","Jeannette Bohg"],"pdf_url":"https://arxiv.org/pdf/2501.06994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06987v1","updated":"2025-01-13T00:29:57Z","published":"2025-01-13T00:29:57Z","title":"Hand-Object Contact Detection using Grasp Quality Metrics","summary":" We propose a novel hand-object contact detection system based on grasp\nquality metrics extracted from object and hand poses, and evaluated its\nperformance using the DexYCB dataset. Our evaluation demonstrated the system's\nhigh accuracy (approaching 90%). Future work will focus on a real-time\nimplementation using vision-based estimation, and integrating it to a\nrobot-to-human handover system.\n","authors":["Akansel Cosgun","Thanh Vinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.06987v1.pdf","comment":"Submitted to the 2025 IEEE/ACM International Conference on\n Human-Robot Interaction (HRI'25)"},{"id":"http://arxiv.org/abs/2411.10941v2","updated":"2025-01-13T00:03:58Z","published":"2024-11-17T02:39:58Z","title":"Efficient Estimation of Relaxed Model Parameters for Robust UAV\n Trajectory Optimization","summary":" Online trajectory optimization and optimal control methods are crucial for\nenabling sustainable unmanned aerial vehicle (UAV) services, such as\nagriculture, environmental monitoring, and transportation, where available\nactuation and energy are limited. However, optimal controllers are highly\nsensitive to model mismatch, which can occur due to loaded equipment, packages\nto be delivered, or pre-existing variability in fundamental structural and\nthrust-related parameters. To circumvent this problem, optimal controllers can\nbe paired with parameter estimators to improve their trajectory planning\nperformance and perform adaptive control. 
However, UAV platforms are limited in\nterms of onboard processing power, oftentimes making nonlinear parameter\nestimation too computationally expensive to consider. To address these issues,\nwe propose a relaxed, affine-in-parameters multirotor model along with an\nefficient optimal parameter estimator. We convexify the nominal Moving Horizon\nParameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via\nan affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast\nquadratic programs (QPs) that facilitate adaptive Model Predictve Control (MPC)\nin real time. We compare this approach to the equivalent nonlinear estimator in\nMonte Carlo simulations, demonstrating a decrease in average solve time and\ntrajectory optimality cost by 98.2% and 23.9-56.2%, respectively.\n","authors":["Derek Fan","David A. Copp"],"pdf_url":"https://arxiv.org/pdf/2411.10941v2.pdf","comment":"8 pages, 5 figures, to be published in IEEE Sustech 2025"},{"id":"http://arxiv.org/abs/2501.07713v1","updated":"2025-01-13T21:52:46Z","published":"2025-01-13T21:52:46Z","title":"Testing Human-Hand Segmentation on In-Distribution and\n Out-of-Distribution Data in Human-Robot Interactions Using a Deep Ensemble\n Model","summary":" Reliable detection and segmentation of human hands are critical for enhancing\nsafety and facilitating advanced interactions in human-robot collaboration.\nCurrent research predominantly evaluates hand segmentation under\nin-distribution (ID) data, which reflects the training data of deep learning\n(DL) models. However, this approach fails to address out-of-distribution (OOD)\nscenarios that often arise in real-world human-robot interactions. In this\nstudy, we present a novel approach by evaluating the performance of pre-trained\nDL models under both ID data and more challenging OOD scenarios. To mimic\nrealistic industrial scenarios, we designed a diverse dataset featuring simple\nand cluttered backgrounds with industrial tools, varying numbers of hands (0 to\n4), and hands with and without gloves. For OOD scenarios, we incorporated\nunique and rare conditions such as finger-crossing gestures and motion blur\nfrom fast-moving hands, addressing both epistemic and aleatoric uncertainties.\nTo ensure multiple point of views (PoVs), we utilized both egocentric cameras,\nmounted on the operator's head, and static cameras to capture RGB images of\nhuman-robot interactions. This approach allowed us to account for multiple\ncamera perspectives while also evaluating the performance of models trained on\nexisting egocentric datasets as well as static-camera datasets. For\nsegmentation, we used a deep ensemble model composed of UNet and RefineNet as\nbase learners. Performance evaluation was conducted using segmentation metrics\nand uncertainty quantification via predictive entropy. Results revealed that\nmodels trained on industrial datasets outperformed those trained on\nnon-industrial datasets, highlighting the importance of context-specific\ntraining. 
Although all models struggled with OOD scenarios, those trained on\nindustrial datasets demonstrated significantly better generalization.\n","authors":["Reza Jalayer","Yuxin Chen","Masoud Jalayer","Carlotta Orsenigo","Masayoshi Tomizuka"],"pdf_url":"https://arxiv.org/pdf/2501.07713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07705v1","updated":"2025-01-13T21:32:42Z","published":"2025-01-13T21:32:42Z","title":"Autonomous Electrochemistry Platform with Real-Time Normality Testing of\n Voltammetry Measurements Using ML","summary":" Electrochemistry workflows utilize various instruments and computing systems\nto execute workflows consisting of electrocatalyst synthesis, testing and\nevaluation tasks. The heterogeneity of the software and hardware of these\necosystems makes it challenging to orchestrate a complete workflow from\nproduction to characterization by automating its tasks. We propose an\nautonomous electrochemistry computing platform for a multi-site ecosystem that\nprovides the services for remote experiment steering, real-time measurement\ntransfer, and AI/ML-driven analytics. We describe the integration of a mobile\nrobot and synthesis workstation into the ecosystem by developing custom\nhub-networks and software modules to support remote operations over the\necosystem's wireless and wired networks. We describe a workflow task for\ngenerating I-V voltammetry measurements using a potentiostat, and a machine\nlearning framework to ensure their normality by detecting abnormal conditions\nsuch as disconnected electrodes. We study a number of machine learning methods\nfor the underlying detection problem, including smooth, non-smooth, structural\nand statistical methods, and their fusers. We present experimental results to\nillustrate the effectiveness of this platform, and also validate the proposed\nML method by deriving its rigorous generalization equations.\n","authors":["Anees Al-Najjar","Nageswara S. V. Rao","Craig A. Bridges","Sheng Dai","Alex Walters"],"pdf_url":"https://arxiv.org/pdf/2501.07705v1.pdf","comment":"10 pages, 14 figures, accepted in the IEEE 20th International\n Conference on e-Science (e-Science), 2024"},{"id":"http://arxiv.org/abs/2403.04917v3","updated":"2025-01-13T20:28:04Z","published":"2024-03-07T22:03:36Z","title":"A Mixed-Integer Conic Program for the Moving-Target Traveling Salesman\n Problem based on a Graph of Convex Sets","summary":" This paper introduces a new formulation that finds the optimum for the\nMoving-Target Traveling Salesman Problem (MT-TSP), which seeks to find a\nshortest path for an agent, that starts at a depot, visits a set of moving\ntargets exactly once within their assigned time-windows, and returns to the\ndepot. The formulation relies on the key idea that when the targets move along\nlines, their trajectories become convex sets within the space-time coordinate\nsystem. The problem then reduces to finding the shortest path within a graph of\nconvex sets, subject to some speed constraints. We compare our formulation with\nthe current state-of-the-art Mixed Integer Conic Program (MICP) solver for the\nMT-TSP. The experimental results show that our formulation outperforms the MICP\nfor instances with up to 20 targets, with up to two orders of magnitude\nreduction in runtime, and up to a 60\\% tighter optimality gap. 
We also show\nthat the solution cost from the convex relaxation of our formulation provides\nsignificantly tighter lower bounds for the MT-TSP than the ones from the MICP.\n","authors":["Allen George Philip","Zhongqiang Ren","Sivakumar Rathinam","Howie Choset"],"pdf_url":"https://arxiv.org/pdf/2403.04917v3.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.02365v5","updated":"2025-01-13T20:06:35Z","published":"2024-06-04T14:43:50Z","title":"Exploiting Chordal Sparsity for Fast Global Optimality with Application\n to Localization","summary":" In recent years, many estimation problems in robotics have been shown to be\nsolvable to global optimality using their semidefinite relaxations. However,\nthe runtime complexity of off-the-shelf semidefinite programming (SDP) solvers\nis up to cubic in problem size, which inhibits real-time solutions of problems\ninvolving large state dimensions. We show that for a large class of problems,\nnamely those with chordal sparsity, we can reduce the complexity of these\nsolvers to linear in problem size. In particular, we show how to replace the\nlarge positive-semidefinite variable with a number of smaller interconnected\nones using the well-known chordal decomposition. This formulation also allows\nfor the straightforward application of the alternating direction method of\nmultipliers (ADMM), which can exploit parallelism for increased scalability. We\nshow for two example problems in simulation that the chordal solvers provide a\nsignificant speed-up over standard SDP solvers, and that global optimality is\ncrucial in the absence of good initializations.\n","authors":["Frederike Dümbgen","Connor Holmes","Timothy D. Barfoot"],"pdf_url":"https://arxiv.org/pdf/2406.02365v5.pdf","comment":"21 pages, 6 figures. Version history: v1: initial arXiv, v2: WAFR\n submission, v3: correction, v4: WAFR conference-ready, v5: WAFR SPAR journal\n version"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.07575v1","updated":"2025-01-13T18:59:48Z","published":"2025-01-13T18:59:48Z","title":"Dataset Distillation via Committee Voting","summary":" Dataset distillation aims to synthesize a smaller, representative dataset\nthat preserves the essential properties of the original data, enabling\nefficient model training with reduced computational resources. Prior work has\nprimarily focused on improving the alignment or matching process between\noriginal and synthetic data, or on enhancing the efficiency of distilling large\ndatasets. In this work, we introduce ${\\bf C}$ommittee ${\\bf V}$oting for ${\\bf\nD}$ataset ${\\bf D}$istillation (CV-DD), a novel and orthogonal approach that\nleverages the collective wisdom of multiple models or experts to create\nhigh-quality distilled datasets. We start by showing how to establish a strong\nbaseline that already achieves state-of-the-art accuracy through leveraging\nrecent advancements and thoughtful adjustments in model design and optimization\nprocesses. By integrating distributions and predictions from a committee of\nmodels while generating high-quality soft labels, our method captures a wider\nspectrum of data features, reduces model-specific biases and the adverse\neffects of distribution shifts, leading to significant improvements in\ngeneralization. This voting-based strategy not only promotes diversity and\nrobustness within the distilled dataset but also significantly reduces\noverfitting, resulting in improved performance on post-eval tasks. 
Extensive\nexperiments across various datasets and IPCs (images per class) demonstrate\nthat Committee Voting leads to more reliable and adaptable distilled data\ncompared to single/multi-model distillation methods, demonstrating its\npotential for efficient and accurate dataset distillation. Code is available\nat: https://github.com/Jiacheng8/CV-DD.\n","authors":["Jiacheng Cui","Zhaoyi Li","Xiaochen Ma","Xinyue Bi","Yaxin Luo","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2501.07575v1.pdf","comment":"Code at: https://github.com/Jiacheng8/CV-DD"},{"id":"http://arxiv.org/abs/2501.07574v1","updated":"2025-01-13T18:59:20Z","published":"2025-01-13T18:59:20Z","title":"UnCommon Objects in 3D","summary":" We introduce Uncommon Objects in 3D (uCO3D), a new object-centric dataset for\n3D deep learning and 3D generative AI. uCO3D is the largest publicly-available\ncollection of high-resolution videos of objects with 3D annotations that\nensures full-360$^{\\circ}$ coverage. uCO3D is significantly more diverse than\nMVImgNet and CO3Dv2, covering more than 1,000 object categories. It is also of\nhigher quality, due to extensive quality checks of both the collected videos\nand the 3D annotations. Similar to analogous datasets, uCO3D contains\nannotations for 3D camera poses, depth maps and sparse point clouds. In\naddition, each object is equipped with a caption and a 3D Gaussian Splat\nreconstruction. We train several large 3D models on MVImgNet, CO3Dv2, and uCO3D\nand obtain superior results using the latter, showing that uCO3D is better for\nlearning applications.\n","authors":["Xingchen Liu","Piyush Tayal","Jianyuan Wang","Jesus Zarzar","Tom Monnier","Konstantinos Tertikas","Jiali Duan","Antoine Toisoul","Jason Y. Zhang","Natalia Neverova","Andrea Vedaldi","Roman Shapovalov","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2501.07574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07563v1","updated":"2025-01-13T18:53:08Z","published":"2025-01-13T18:53:08Z","title":"Training-Free Motion-Guided Video Generation with Enhanced Temporal\n Consistency Using Motion Consistency Loss","summary":" In this paper, we address the challenge of generating temporally consistent\nvideos with motion guidance. While many existing methods depend on additional\ncontrol modules or inference-time fine-tuning, recent studies suggest that\neffective motion guidance is achievable without altering the model architecture\nor requiring extra training. Such approaches offer promising compatibility with\nvarious video generation foundation models. However, existing training-free\nmethods often struggle to maintain consistent temporal coherence across frames\nor to follow guided motion accurately. In this work, we propose a simple yet\neffective solution that combines an initial-noise-based approach with a novel\nmotion consistency loss, the latter being our key innovation. Specifically, we\ncapture the inter-frame feature correlation patterns of intermediate features\nfrom a video diffusion model to represent the motion pattern of the reference\nvideo. We then design a motion consistency loss to maintain similar feature\ncorrelation patterns in the generated video, using the gradient of this loss in\nthe latent space to guide the generation process for precise motion control.\nThis approach improves temporal consistency across various motion control tasks\nwhile preserving the benefits of a training-free setup. 
Extensive experiments\nshow that our method sets a new standard for efficient, temporally coherent\nvideo generation.\n","authors":["Xinyu Zhang","Zicheng Duan","Dong Gong","Lingqiao Liu"],"pdf_url":"https://arxiv.org/pdf/2501.07563v1.pdf","comment":"Project page:\n https://zhangxinyu-xyz.github.io/SimulateMotion.github.io/"},{"id":"http://arxiv.org/abs/2501.07556v1","updated":"2025-01-13T18:37:36Z","published":"2025-01-13T18:37:36Z","title":"MatchAnything: Universal Cross-Modality Image Matching with Large-Scale\n Pre-Training","summary":" Image matching, which aims to identify corresponding pixel locations between\nimages, is crucial in a wide range of scientific disciplines, aiding in image\nregistration, fusion, and analysis. In recent years, deep learning-based image\nmatching algorithms have dramatically outperformed humans in rapidly and\naccurately finding large amounts of correspondences. However, when dealing with\nimages captured under different imaging modalities that result in significant\nappearance changes, the performance of these algorithms often deteriorates due\nto the scarcity of annotated cross-modal training data. This limitation hinders\napplications in various fields that rely on multiple image modalities to obtain\ncomplementary information. To address this challenge, we propose a large-scale\npre-training framework that utilizes synthetic cross-modal training signals,\nincorporating diverse data from various sources, to train models to recognize\nand match fundamental structures across images. This capability is transferable\nto real-world, unseen cross-modality image matching tasks. Our key finding is\nthat the matching model trained with our framework achieves remarkable\ngeneralizability across more than eight unseen cross-modality registration\ntasks using the same network weight, substantially outperforming existing\nmethods, whether designed for generalization or tailored for specific tasks.\nThis advancement significantly enhances the applicability of image matching\ntechnologies across various scientific disciplines and paves the way for new\napplications in multi-modality human and artificial intelligence analysis and\nbeyond.\n","authors":["Xingyi He","Hao Yu","Sida Peng","Dongli Tan","Zehong Shen","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.07556v1.pdf","comment":"Project page: https://zju3dv.github.io/MatchAnything/"},{"id":"http://arxiv.org/abs/2501.07554v1","updated":"2025-01-13T18:37:08Z","published":"2025-01-13T18:37:08Z","title":"SST-EM: Advanced Metrics for Evaluating Semantic, Spatial and Temporal\n Aspects in Video Editing","summary":" Video editing models have advanced significantly, but evaluating their\nperformance remains challenging. Traditional metrics, such as CLIP text and\nimage scores, often fall short: text scores are limited by inadequate training\ndata and hierarchical dependencies, while image scores fail to assess temporal\nconsistency. We present SST-EM (Semantic, Spatial, and Temporal Evaluation\nMetric), a novel evaluation framework that leverages modern Vision-Language\nModels (VLMs), Object Detection, and Temporal Consistency checks. SST-EM\ncomprises four components: (1) semantic extraction from frames using a VLM, (2)\nprimary object tracking with Object Detection, (3) focused object refinement\nvia an LLM agent, and (4) temporal consistency assessment using a Vision\nTransformer (ViT). 
These components are integrated into a unified metric with\nweights derived from human evaluations and regression analysis. The name SST-EM\nreflects its focus on Semantic, Spatial, and Temporal aspects of video\nevaluation. SST-EM provides a comprehensive evaluation of semantic fidelity and\ntemporal smoothness in video editing. The source code is available in the\n\\textbf{\\href{https://github.com/custommetrics-sst/SST_CustomEvaluationMetrics.git}{GitHub\nRepository}}.\n","authors":["Varun Biyyala","Bharat Chanderprakash Kathuria","Jialu Li","Youshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07554v1.pdf","comment":"WACV workshop"},{"id":"http://arxiv.org/abs/2501.07542v1","updated":"2025-01-13T18:23:57Z","published":"2025-01-13T18:23:57Z","title":"Imagine while Reasoning in Space: Multimodal Visualization-of-Thought","summary":" Chain-of-Thought (CoT) prompting has proven highly effective for enhancing\ncomplex reasoning in Large Language Models (LLMs) and Multimodal Large Language\nModels (MLLMs). Yet, it struggles in complex spatial reasoning tasks.\nNonetheless, human cognition extends beyond language alone, enabling the\nremarkable capability to think in both words and images. Inspired by this\nmechanism, we propose a new reasoning paradigm, Multimodal\nVisualization-of-Thought (MVoT). It enables visual thinking in MLLMs by\ngenerating image visualizations of their reasoning traces. To ensure\nhigh-quality visualization, we introduce token discrepancy loss into\nautoregressive MLLMs. This innovation significantly improves both visual\ncoherence and fidelity. We validate this approach through several dynamic\nspatial reasoning tasks. Experimental results reveal that MVoT demonstrates\ncompetitive performance across tasks. Moreover, it exhibits robust and reliable\nimprovements in the most challenging scenarios where CoT fails. Ultimately,\nMVoT establishes new possibilities for complex reasoning tasks where visual\nthinking can effectively complement verbal reasoning.\n","authors":["Chengzu Li","Wenshan Wu","Huanyu Zhang","Yan Xia","Shaoguang Mao","Li Dong","Ivan Vulić","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2501.07542v1.pdf","comment":"11 pages, 6 figures, 4 tables (27 pages, 10 figures, 16 tables\n including references and appendices)"},{"id":"http://arxiv.org/abs/2411.11222v2","updated":"2025-01-13T18:20:35Z","published":"2024-11-18T01:19:37Z","title":"The Sound of Water: Inferring Physical Properties from Pouring Liquids","summary":" We study the connection between audio-visual observations and the underlying\nphysics of a mundane yet intriguing everyday activity: pouring liquids. Given\nonly the sound of liquid pouring into a container, our objective is to\nautomatically infer physical properties such as the liquid level, the shape and\nsize of the container, the pouring rate and the time to fill. To this end, we:\n(i) show in theory that these properties can be determined from the fundamental\nfrequency (pitch); (ii) train a pitch detection model with supervision from\nsimulated data and visual data with a physics-inspired objective; (iii)\nintroduce a new large dataset of real pouring videos for a systematic study;\n(iv) show that the trained model can indeed infer these physical properties for\nreal data; and finally, (v) we demonstrate strong generalization to various\ncontainer shapes, other datasets, and in-the-wild YouTube videos. Our work\npresents a keen understanding of a narrow yet rich problem at the intersection\nof acoustics, physics, and learning. 
It opens up applications to enhance\nmultisensory perception in robotic pouring.\n","authors":["Piyush Bagad","Makarand Tapaswi","Cees G. M. Snoek","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2411.11222v2.pdf","comment":"Project page at https://bpiyush.github.io/pouring-water-website.\n Short version accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2302.04850v3","updated":"2025-01-13T18:18:24Z","published":"2023-02-09T18:53:44Z","title":"Robot Synesthesia: A Sound and Emotion Guided AI Painter","summary":" If a picture paints a thousand words, sound may voice a million. While recent\nrobotic painting and image synthesis methods have achieved progress in\ngenerating visuals from text inputs, the translation of sound into images is\nvastly unexplored. Generally, sound-based interfaces and sonic interactions\nhave the potential to expand accessibility and control for the user and provide\na means to convey complex emotions and the dynamic aspects of the real world.\nIn this paper, we propose an approach for using sound and speech to guide a\nrobotic painting process, known here as robot synesthesia. For general sound,\nwe encode the simulated paintings and input sounds into the same latent space.\nFor speech, we decouple speech into its transcribed text and the tone of the\nspeech. Whereas we use the text to control the content, we estimate the\nemotions from the tone to guide the mood of the painting. Our approach has been\nfully integrated with FRIDA, a robotic painting framework, adding sound and\nspeech to FRIDA's existing input modalities, such as text and style. In two\nsurveys, participants were able to correctly guess the emotion or natural sound\nused to generate a given painting more than twice as likely as random chance.\nOn our sound-guided image manipulation and music-guided paintings, we discuss\nthe results qualitatively.\n","authors":["Vihaan Misra","Peter Schaldenbrand","Jean Oh"],"pdf_url":"https://arxiv.org/pdf/2302.04850v3.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2306.11207v4","updated":"2025-01-13T18:16:34Z","published":"2023-06-20T00:14:47Z","title":"Quilt-1M: One Million Image-Text Pairs for Histopathology","summary":" Recent accelerations in multi-modal applications have been made possible with\nthe plethora of image and text data available online. However, the scarcity of\nanalogous data in the medical field, specifically in histopathology, has slowed\ncomparable progress. To enable similar representation learning for\nhistopathology, we turn to YouTube, an untapped resource of videos, offering\n$1,087$ hours of valuable educational histopathology videos from expert\nclinicians. From YouTube, we curate QUILT: a large-scale vision-language\ndataset consisting of $802, 144$ image and text pairs. QUILT was automatically\ncurated using a mixture of models, including large language models, handcrafted\nalgorithms, human knowledge databases, and automatic speech recognition. In\ncomparison, the most comprehensive datasets curated for histopathology amass\nonly around $200$K samples. We combine QUILT with datasets from other sources,\nincluding Twitter, research papers, and the internet in general, to create an\neven larger dataset: QUILT-1M, with $1$M paired image-text samples, marking it\nas the largest vision-language histopathology dataset to date. We demonstrate\nthe value of QUILT-1M by fine-tuning a pre-trained CLIP model. 
Our model\noutperforms state-of-the-art models on both zero-shot and linear probing tasks\nfor classifying new histopathology images across $13$ diverse patch-level\ndatasets of $8$ different sub-pathologies and cross-modal retrieval tasks.\n","authors":["Wisdom Oluchi Ikezogwo","Mehmet Saygin Seyfioglu","Fatemeh Ghezloo","Dylan Stefan Chan Geva","Fatwir Sheikh Mohammed","Pavan Kumar Anand","Ranjay Krishna","Linda Shapiro"],"pdf_url":"https://arxiv.org/pdf/2306.11207v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07533v1","updated":"2025-01-13T18:10:19Z","published":"2025-01-13T18:10:19Z","title":"Confident Pseudo-labeled Diffusion Augmentation for Canine Cardiomegaly\n Detection","summary":" Canine cardiomegaly, marked by an enlarged heart, poses serious health risks\nif undetected, requiring accurate diagnostic methods. Current detection models\noften rely on small, poorly annotated datasets and struggle to generalize\nacross diverse imaging conditions, limiting their real-world applicability. To\naddress these issues, we propose a Confident Pseudo-labeled Diffusion\nAugmentation (CDA) model for identifying canine cardiomegaly. Our approach\naddresses the challenge of limited high-quality training data by employing\ndiffusion models to generate synthetic X-ray images and annotate Vertebral\nHeart Score key points, thereby expanding the dataset. We also employ a\npseudo-labeling strategy with Monte Carlo Dropout to select high-confidence\nlabels, refine the synthetic dataset, and improve accuracy. Iteratively\nincorporating these labels enhances the model's performance, overcoming the\nlimitations of existing approaches. Experimental results show that the CDA\nmodel outperforms traditional methods, achieving state-of-the-art accuracy in\ncanine cardiomegaly detection. The code implementation is available at\nhttps://github.com/Shira7z/CDA.\n","authors":["Shiman Zhang","Lakshmikar Reddy Polamreddy","Youshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07533v1.pdf","comment":"WACV workshop"},{"id":"http://arxiv.org/abs/2501.07530v1","updated":"2025-01-13T18:08:27Z","published":"2025-01-13T18:08:27Z","title":"IP-FaceDiff: Identity-Preserving Facial Video Editing with Diffusion","summary":" Facial video editing has become increasingly important for content creators,\nenabling the manipulation of facial expressions and attributes. However,\nexisting models encounter challenges such as poor editing quality, high\ncomputational costs and difficulties in preserving facial identity across\ndiverse edits. Additionally, these models are often constrained to editing\npredefined facial attributes, limiting their flexibility to diverse editing\nprompts. To address these challenges, we propose a novel facial video editing\nframework that leverages the rich latent space of pre-trained text-to-image\n(T2I) diffusion models and fine-tune them specifically for facial video editing\ntasks. Our approach introduces a targeted fine-tuning scheme that enables high\nquality, localized, text-driven edits while ensuring identity preservation\nacross video frames. Additionally, by using pre-trained T2I models during\ninference, our approach significantly reduces editing time by 80%, while\nmaintaining temporal consistency throughout the video sequence. We evaluate the\neffectiveness of our approach through extensive testing across a wide range of\nchallenging scenarios, including varying head poses, complex action sequences,\nand diverse facial expressions. 
Our method consistently outperforms existing\ntechniques, demonstrating superior performance across a broad set of metrics\nand benchmarks.\n","authors":["Tharun Anand","Aryan Garg","Kaushik Mitra"],"pdf_url":"https://arxiv.org/pdf/2501.07530v1.pdf","comment":"WACV-25 Workshop"},{"id":"http://arxiv.org/abs/2402.16865v3","updated":"2025-01-13T18:06:23Z","published":"2024-01-21T04:14:54Z","title":"Enhance Eye Disease Detection using Learnable Probabilistic Discrete\n Latents in Machine Learning Architectures","summary":" Ocular diseases, including diabetic retinopathy and glaucoma, present a\nsignificant public health challenge due to their high prevalence and potential\nfor causing vision impairment. Early and accurate diagnosis is crucial for\neffective treatment and management. In recent years, deep learning models have\nemerged as powerful tools for analysing medical images, such as retina imaging.\nHowever, challenges persist in model relibability and uncertainty estimation,\nwhich are critical for clinical decision-making. This study leverages the\nprobabilistic framework of Generative Flow Networks (GFlowNets) to learn the\nposterior distribution over latent discrete dropout masks for the\nclassification and analysis of ocular diseases using fundus images. We develop\na robust and generalizable method that utilizes GFlowOut integrated with\nResNet18 and ViT models as the backbone in identifying various ocular\nconditions. This study employs a unique set of dropout masks - none, random,\nbottomup, and topdown - to enhance model performance in analyzing these fundus\nimages. Our results demonstrate that our learnable probablistic latents\nsignificantly improves accuracy, outperforming the traditional dropout\napproach. We utilize a gradient map calculation method, Grad-CAM, to assess\nmodel explainability, observing that the model accurately focuses on critical\nimage regions for predictions. The integration of GFlowOut in neural networks\npresents a promising advancement in the automated diagnosis of ocular diseases,\nwith implications for improving clinical workflows and patient outcomes.\n","authors":["Anirudh Prabhakaran","YeKun Xiao","Ching-Yu Cheng","Dianbo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.16865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07525v1","updated":"2025-01-13T17:55:32Z","published":"2025-01-13T17:55:32Z","title":"RadAlign: Advancing Radiology Report Generation with Vision-Language\n Concept Alignment","summary":" Automated chest radiographs interpretation requires both accurate disease\nclassification and detailed radiology report generation, presenting a\nsignificant challenge in the clinical workflow. Current approaches either focus\non classification accuracy at the expense of interpretability or generate\ndetailed but potentially unreliable reports through image captioning\ntechniques. In this study, we present RadAlign, a novel framework that combines\nthe predictive accuracy of vision-language models (VLMs) with the reasoning\ncapabilities of large language models (LLMs). Inspired by the radiologist's\nworkflow, RadAlign first employs a specialized VLM to align visual features\nwith key medical concepts, achieving superior disease classification with an\naverage AUC of 0.885 across multiple diseases. These recognized medical\nconditions, represented as text-based concepts in the aligned visual-language\nspace, are then used to prompt LLM-based report generation. 
Enhanced by a\nretrieval-augmented generation mechanism that grounds outputs in similar\nhistorical cases, RadAlign delivers superior report quality with a GREEN score\nof 0.678, outperforming state-of-the-art methods' 0.634. Our framework\nmaintains strong clinical interpretability while reducing hallucinations,\nadvancing automated medical imaging and report analysis through integrated\npredictive and generative AI. Code is available at\nhttps://github.com/difeigu/RadAlign.\n","authors":["Difei Gu","Yunhe Gao","Yang Zhou","Mu Zhou","Dimitris Metaxas"],"pdf_url":"https://arxiv.org/pdf/2501.07525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11677v2","updated":"2025-01-13T17:45:59Z","published":"2023-04-23T15:09:02Z","title":"RGB-D Indiscernible Object Counting in Underwater Scenes","summary":" Recently, indiscernible/camouflaged scene understanding has attracted lots of\nresearch attention in the vision community. We further advance the frontier of\nthis field by systematically studying a new challenge named indiscernible\nobject counting (IOC), the goal of which is to count objects that are blended\nwith respect to their surroundings. Due to a lack of appropriate IOC datasets,\nwe present a large-scale dataset IOCfish5K which contains a total of 5,637\nhigh-resolution images and 659,024 annotated center points. Our dataset\nconsists of a large number of indiscernible objects (mainly fish) in underwater\nscenes, making the annotation process all the more challenging. IOCfish5K is\nsuperior to existing datasets with indiscernible scenes because of its larger\nscale, higher image resolutions, more annotations, and denser scenes. All these\naspects make it the most challenging dataset for IOC so far, supporting\nprogress in this area. Benefiting from the recent advancements of depth\nestimation foundation models, we construct high-quality depth maps for\nIOCfish5K by generating pseudo labels using the Depth Anything V2 model. The\nRGB-D version of IOCfish5K is named IOCfish5K-D. For benchmarking purposes on\nIOCfish5K, we select 14 mainstream methods for object counting and carefully\nevaluate them. For multimodal IOCfish5K-D, we evaluate other 4 popular\nmultimodal counting methods. Furthermore, we propose IOCFormer, a new strong\nbaseline that combines density and regression branches in a unified framework\nand can effectively tackle object counting under concealed scenes. We also\npropose IOCFormer-D to enable the effective usage of depth modality in helping\ndetect and count objects hidden in their environments. Experiments show that\nIOCFormer and IOCFormer-D achieve state-of-the-art scores on IOCfish5K and\nIOCfish5K-D, respectively.\n","authors":["Guolei Sun","Xiaogang Cheng","Zhaochong An","Xiaokang Wang","Yun Liu","Deng-Ping Fan","Ming-Ming Cheng","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2304.11677v2.pdf","comment":"Journal version. The resources are available at\n https://github.com/GuoleiSun/Indiscernible-Object-Counting"},{"id":"http://arxiv.org/abs/2406.04158v3","updated":"2025-01-13T17:44:43Z","published":"2024-06-06T15:18:59Z","title":"CMAR-Net: Accurate Cross-Modal 3D SAR Reconstruction of Vehicle Targets\n with Sparse Multi-Baseline Data","summary":" Multi-baseline Synthetic Aperture Radar (SAR) three-dimensional (3D)\ntomography is a crucial remote sensing technique that provides 3D resolution\nunavailable in conventional SAR imaging. 
However, achieving high-quality\nimaging typically requires multi-angle or full-aperture data, resulting in\nsignificant imaging costs. Recent advancements in sparse 3D SAR, which rely on\ndata from limited apertures, have gained attention as a cost-effective\nalternative. Notably, deep learning techniques have markedly enhanced the\nimaging quality of sparse 3D SAR. Despite these advancements, existing methods\nprimarily depend on high-resolution radar images for supervising the training\nof deep neural networks (DNNs). This exclusive dependence on single-modal data\nprevents the introduction of complementary information from other data sources,\nlimiting further improvements in imaging performance. In this paper, we\nintroduce a Cross-Modal 3D-SAR Reconstruction Network (CMAR-Net) to enhance 3D\nSAR imaging by integrating heterogeneous information. Leveraging cross-modal\nsupervision from 2D optical images and error transfer guaranteed by\ndifferentiable rendering, CMAR-Net achieves efficient training and reconstructs\nhighly sparse multi-baseline SAR data into visually structured and accurate 3D\nimages, particularly for vehicle targets. Extensive experiments on simulated\nand real-world datasets demonstrate that CMAR-Net significantly outperforms\nSOTA sparse reconstruction algorithms based on compressed sensing (CS) and deep\nlearning (DL). Furthermore, our method eliminates the need for time-consuming\nfull-aperture data preprocessing and relies solely on computer-rendered optical\nimages, significantly reducing dataset construction costs. This work highlights\nthe potential of deep learning for multi-baseline SAR 3D imaging and introduces\na novel framework for radar imaging research through cross-modal learning.\n","authors":["Da Li","Guoqiang Zhao","Houjun Sun","Jiacheng Bao"],"pdf_url":"https://arxiv.org/pdf/2406.04158v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05379v2","updated":"2025-01-13T17:22:30Z","published":"2025-01-09T17:04:33Z","title":"Arc2Avatar: Generating Expressive 3D Avatars from a Single Image via ID\n Guidance","summary":" Inspired by the effectiveness of 3D Gaussian Splatting (3DGS) in\nreconstructing detailed 3D scenes within multi-view setups and the emergence of\nlarge 2D human foundation models, we introduce Arc2Avatar, the first SDS-based\nmethod utilizing a human face foundation model as guidance with just a single\nimage as input. To achieve that, we extend such a model for diverse-view human\nhead generation by fine-tuning on synthetic data and modifying its\nconditioning. Our avatars maintain a dense correspondence with a human face\nmesh template, allowing blendshape-based expression generation. This is\nachieved through a modified 3DGS approach, connectivity regularizers, and a\nstrategic initialization tailored for our task. Additionally, we propose an\noptional efficient SDS-based correction step to refine the blendshape\nexpressions, enhancing realism and diversity. Experiments demonstrate that\nArc2Avatar achieves state-of-the-art realism and identity preservation,\neffectively addressing color issues by allowing the use of very low guidance,\nenabled by our strong identity prior and initialization strategy, without\ncompromising detail. 
Please visit https://arc2avatar.github.io for more\nresources.\n","authors":["Dimitrios Gerogiannis","Foivos Paraperas Papantoniou","Rolandos Alexandros Potamias","Alexandros Lattas","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2501.05379v2.pdf","comment":"Project Page https://arc2avatar.github.io"},{"id":"http://arxiv.org/abs/2501.07499v1","updated":"2025-01-13T17:17:17Z","published":"2025-01-13T17:17:17Z","title":"Three-view Focal Length Recovery From Homographies","summary":" In this paper, we propose a novel approach for recovering focal lengths from\nthree-view homographies. By examining the consistency of normal vectors between\ntwo homographies, we derive new explicit constraints between the focal lengths\nand homographies using an elimination technique. We demonstrate that three-view\nhomographies provide two additional constraints, enabling the recovery of one\nor two focal lengths. We discuss four possible cases, including three cameras\nhaving an unknown equal focal length, three cameras having two different\nunknown focal lengths, three cameras where one focal length is known, and the\nother two cameras have equal or different unknown focal lengths. All the\nproblems can be converted into solving polynomials in one or two unknowns,\nwhich can be efficiently solved using Sturm sequence or hidden variable\ntechnique. Evaluation using both synthetic and real data shows that the\nproposed solvers are both faster and more accurate than methods relying on\nexisting two-view solvers. The code and data are available on\nhttps://github.com/kocurvik/hf\n","authors":["Yaqing Ding","Viktor Kocur","Zuzana Berger Haladová","Qianliang Wu","Shen Cai","Jian Yang","Zuzana Kukelova"],"pdf_url":"https://arxiv.org/pdf/2501.07499v1.pdf","comment":"Code available at https://github.com/kocurvik/hf Dataset available\n at: https://doi.org/10.5281/zenodo.14638904"},{"id":"http://arxiv.org/abs/2401.10815v2","updated":"2025-01-13T17:14:52Z","published":"2024-01-19T17:02:17Z","title":"RAD-DINO: Exploring Scalable Medical Image Encoders Beyond Text\n Supervision","summary":" Language-supervised pre-training has proven to be a valuable method for\nextracting semantically meaningful features from images, serving as a\nfoundational element in multimodal systems within the computer vision and\nmedical imaging domains. However, the computed features are limited by the\ninformation contained in the text, which is particularly problematic in medical\nimaging, where the findings described by radiologists focus on specific\nobservations. This challenge is compounded by the scarcity of paired\nimaging-text data due to concerns over leakage of personal health information.\nIn this work, we fundamentally challenge the prevailing reliance on language\nsupervision for learning general-purpose biomedical imaging encoders. We\nintroduce RAD-DINO, a biomedical image encoder pre-trained solely on unimodal\nbiomedical imaging data that obtains similar or greater performance than\nstate-of-the-art biomedical language-supervised models on a diverse range of\nbenchmarks. Specifically, the quality of learned representations is evaluated\non standard imaging tasks (classification and semantic segmentation), and a\nvision-language alignment task (text report generation from images). 
To further\ndemonstrate the drawback of language supervision, we show that features from\nRAD-DINO correlate with other medical records (e.g., sex or age) better than\nlanguage-supervised models, which are generally not mentioned in radiology\nreports. Finally, we conduct a series of ablations determining the factors in\nRAD-DINO's performance; notably, we observe that RAD-DINO's downstream\nperformance scales well with the quantity and diversity of training data,\ndemonstrating that image-only supervision is a scalable approach for training a\nfoundational biomedical image encoder. Model weights of RAD-DINO trained on\npublicly available datasets are available at\nhttps://huggingface.co/microsoft/rad-dino.\n","authors":["Fernando Pérez-García","Harshita Sharma","Sam Bond-Taylor","Kenza Bouzid","Valentina Salvatelli","Maximilian Ilse","Shruthi Bannur","Daniel C. Castro","Anton Schwaighofer","Matthew P. Lungren","Maria Wetscherek","Noel Codella","Stephanie L. Hyland","Javier Alvarez-Valle","Ozan Oktay"],"pdf_url":"https://arxiv.org/pdf/2401.10815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07496v1","updated":"2025-01-13T17:14:25Z","published":"2025-01-13T17:14:25Z","title":"Aligning First, Then Fusing: A Novel Weakly Supervised Multimodal\n Violence Detection Method","summary":" Weakly supervised violence detection refers to the technique of training\nmodels to identify violent segments in videos using only video-level labels.\nAmong these approaches, multimodal violence detection, which integrates\nmodalities such as audio and optical flow, holds great potential. Existing\nmethods in this domain primarily focus on designing multimodal fusion models to\naddress modality discrepancies. In contrast, we take a different approach;\nleveraging the inherent discrepancies across modalities in violence event\nrepresentation to propose a novel multimodal semantic feature alignment method.\nThis method sparsely maps the semantic features of local, transient, and less\ninformative modalities ( such as audio and optical flow ) into the more\ninformative RGB semantic feature space. Through an iterative process, the\nmethod identifies the suitable no-zero feature matching subspace and aligns the\nmodality-specific event representations based on this subspace, enabling the\nfull exploitation of information from all modalities during the subsequent\nmodality fusion stage. Building on this, we design a new weakly supervised\nviolence detection framework that consists of unimodal multiple-instance\nlearning for extracting unimodal semantic features, multimodal alignment,\nmultimodal fusion, and final detection. Experimental results on benchmark\ndatasets demonstrate the effectiveness of our method, achieving an average\nprecision (AP) of 86.07% on the XD-Violence dataset. Our code is available at\nhttps://github.com/xjpp2016/MAVD.\n","authors":["Wenping Jin","Li Zhu","Jing Sun"],"pdf_url":"https://arxiv.org/pdf/2501.07496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01541v2","updated":"2025-01-13T16:55:29Z","published":"2024-09-03T02:18:45Z","title":"Agentic Copyright Watermarking against Adversarial Evidence Forgery with\n Purification-Agnostic Curriculum Proxy Learning","summary":" With the proliferation of AI agents in various domains, protecting the\nownership of AI models has become crucial due to the significant investment in\ntheir development. 
Unauthorized use and illegal distribution of these models\npose serious threats to intellectual property, necessitating effective\ncopyright protection measures. Model watermarking has emerged as a key\ntechnique to address this issue, embedding ownership information within models\nto assert rightful ownership during copyright disputes. This paper presents\nseveral contributions to model watermarking: a self-authenticating black-box\nwatermarking protocol using hash techniques, a study on evidence forgery\nattacks using adversarial perturbations, a proposed defense involving a\npurification step to counter adversarial attacks, and a purification-agnostic\ncurriculum proxy learning method to enhance watermark robustness and model\nperformance. Experimental results demonstrate the effectiveness of these\napproaches in improving the security, reliability, and performance of\nwatermarked models.\n","authors":["Erjin Bao","Ching-Chun Chang","Hanrui Wang","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2409.01541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07478v1","updated":"2025-01-13T16:52:28Z","published":"2025-01-13T16:52:28Z","title":"3DGS-to-PC: Convert a 3D Gaussian Splatting Scene into a Dense Point\n Cloud or Mesh","summary":" 3D Gaussian Splatting (3DGS) excels at producing highly detailed 3D\nreconstructions, but these scenes often require specialised renderers for\neffective visualisation. In contrast, point clouds are a widely used 3D\nrepresentation and are compatible with most popular 3D processing software, yet\nconverting 3DGS scenes into point clouds is a complex challenge. In this work\nwe introduce 3DGS-to-PC, a flexible and highly customisable framework that is\ncapable of transforming 3DGS scenes into dense, high-accuracy point clouds. We\nsample points probabilistically from each Gaussian as a 3D density function. We\nadditionally threshold new points using the Mahalanobis distance to the\nGaussian centre, preventing extreme outliers. The result is a point cloud that\nclosely represents the shape encoded into the 3D Gaussian scene. Individual\nGaussians use spherical harmonics to adapt colours depending on view, and each\npoint may contribute only subtle colour hints to the resulting rendered scene.\nTo avoid spurious or incorrect colours that do not fit with the final point\ncloud, we recalculate Gaussian colours via a customised image rendering\napproach, assigning each Gaussian the colour of the pixel to which it\ncontributes most across all views. 3DGS-to-PC also supports mesh generation\nthrough Poisson Surface Reconstruction, applied to points sampled from\npredicted surface Gaussians. This allows coloured meshes to be generated from\n3DGS scenes without the need for re-training. 
This package is highly\ncustomisable and capability of simple integration into existing 3DGS pipelines.\n3DGS-to-PC provides a powerful tool for converting 3DGS data into point cloud\nand surface-based formats.\n","authors":["Lewis A G Stuart","Michael P Pound"],"pdf_url":"https://arxiv.org/pdf/2501.07478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03033v2","updated":"2025-01-13T16:42:03Z","published":"2024-11-05T12:10:02Z","title":"Rethinking Decoders for Transformer-based Semantic Segmentation: A\n Compression Perspective","summary":" State-of-the-art methods for Transformer-based semantic segmentation\ntypically adopt Transformer decoders that are used to extract additional\nembeddings from image embeddings via cross-attention, refine either or both\ntypes of embeddings via self-attention, and project image embeddings onto the\nadditional embeddings via dot-product. Despite their remarkable success, these\nempirical designs still lack theoretical justifications or interpretations,\nthus hindering potentially principled improvements. In this paper, we argue\nthat there are fundamental connections between semantic segmentation and\ncompression, especially between the Transformer decoders and Principal\nComponent Analysis (PCA). From such a perspective, we derive a white-box, fully\nattentional DEcoder for PrIncipled semantiC segemenTation (DEPICT), with the\ninterpretations as follows: 1) the self-attention operator refines image\nembeddings to construct an ideal principal subspace that aligns with the\nsupervision and retains most information; 2) the cross-attention operator seeks\nto find a low-rank approximation of the refined image embeddings, which is\nexpected to be a set of orthonormal bases of the principal subspace and\ncorresponds to the predefined classes; 3) the dot-product operation yields\ncompact representation for image embeddings as segmentation masks. Experiments\nconducted on dataset ADE20K find that DEPICT consistently outperforms its\nblack-box counterpart, Segmenter, and it is light weight and more robust.\n","authors":["Qishuai Wen","Chun-Guang Li"],"pdf_url":"https://arxiv.org/pdf/2411.03033v2.pdf","comment":"NeurIPS2024. Code:https://github.com/QishuaiWen/DEPICT/"},{"id":"http://arxiv.org/abs/2410.00982v2","updated":"2025-01-13T16:27:06Z","published":"2024-10-01T18:10:23Z","title":"ScVLM: Enhancing Vision-Language Model for Safety-Critical Event\n Understanding","summary":" Accurately identifying, understanding and describing traffic safety-critical\nevents (SCEs), including crashes, tire strikes, and near-crashes, is crucial\nfor advanced driver assistance systems, automated driving systems, and traffic\nsafety. As SCEs are rare events, most general vision-language models (VLMs)\nhave not been trained sufficiently to link SCE videos and narratives, which\ncould lead to hallucinations and missing key safety characteristics. Here, we\nintroduce ScVLM, a novel hybrid methodology that integrates supervised and\ncontrastive learning techniques to classify the severity and types of SCEs, as\nwell as to generate narrative descriptions of SCEs. This approach utilizes\nclassification to enhance VLMs' comprehension of driving videos and improve the\nrationality of event descriptions. The proposed approach is trained on and\nevaluated by more than 8,600 SCEs from the Second Strategic Highway Research\nProgram Naturalistic Driving Study dataset, the largest publicly accessible\ndriving dataset with videos and SCE annotations. 
The results demonstrate the\nsuperiority of the proposed approach in generating contextually accurate event\ndescriptions and mitigating VLM hallucinations. The code will be available at\nhttps://github.com/datadrivenwheels/ScVLM.\n","authors":["Liang Shi","Boyu Jiang","Tong Zeng","Feng Guo"],"pdf_url":"https://arxiv.org/pdf/2410.00982v2.pdf","comment":"To appear in Proceedings of the IEEE/CVF Winter Conference on\n Applications of Computer Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2501.07451v1","updated":"2025-01-13T16:24:49Z","published":"2025-01-13T16:24:49Z","title":"A Survey on Dynamic Neural Networks: from Computer Vision to Multi-modal\n Sensor Fusion","summary":" Model compression is essential in the deployment of large Computer Vision\nmodels on embedded devices. However, static optimization techniques (e.g.\npruning, quantization, etc.) neglect the fact that different inputs have\ndifferent complexities, thus requiring different amount of computations.\nDynamic Neural Networks allow to condition the number of computations to the\nspecific input. The current literature on the topic is very extensive and\nfragmented. We present a comprehensive survey that synthesizes and unifies\nexisting Dynamic Neural Networks research in the context of Computer Vision.\nAdditionally, we provide a logical taxonomy based on which component of the\nnetwork is adaptive: the output, the computation graph or the input.\nFurthermore, we argue that Dynamic Neural Networks are particularly beneficial\nin the context of Sensor Fusion for better adaptivity, noise reduction and\ninformation prioritization. We present preliminary works in this direction.\n","authors":["Fabio Montello","Ronja Güldenring","Simone Scardapane","Lazaros Nalpantidis"],"pdf_url":"https://arxiv.org/pdf/2501.07451v1.pdf","comment":"Under review at International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2402.13699v5","updated":"2025-01-13T16:21:58Z","published":"2024-02-21T11:00:23Z","title":"Automation of Quantum Dot Measurement Analysis via Explainable Machine\n Learning","summary":" The rapid development of quantum dot (QD) devices for quantum computing has\nnecessitated more efficient and automated methods for device characterization\nand tuning. This work demonstrates the feasibility and advantages of applying\nexplainable machine learning techniques to the analysis of quantum dot\nmeasurements, paving the way for further advances in automated and transparent\nQD device tuning. Many of the measurements acquired during the tuning process\ncome in the form of images that need to be properly analyzed to guide the\nsubsequent tuning steps. By design, features present in such images capture\ncertain behaviors or states of the measured QD devices. When considered\ncarefully, such features can aid the control and calibration of QD devices. An\nimportant example of such images are so-called $\\textit{triangle plots}$, which\nvisually represent current flow and reveal characteristics important for QD\ndevice calibration. While image-based classification tools, such as\nconvolutional neural networks (CNNs), can be used to verify whether a given\nmeasurement is $\\textit{good}$ and thus warrants the initiation of the next\nphase of tuning, they do not provide any insights into how the device should be\nadjusted in the case of $\\textit{bad}$ images. This is because CNNs sacrifice\nprediction and model intelligibility for high accuracy. 
To ameliorate this\ntrade-off, a recent study introduced an image vectorization approach that\nrelies on the Gabor wavelet transform (Schug $\\textit{et al.}$ 2024\n$\\textit{Proc. XAI4Sci: Explainable Machine Learning for Sciences Workshop\n(AAAI 2024) (Vancouver, Canada)}$ pp 1-6). Here we propose an alternative\nvectorization method that involves mathematical modeling of synthetic triangles\nto mimic the experimental data. Using explainable boosting machines, we show\nthat this new method offers superior explainability of model prediction without\nsacrificing accuracy.\n","authors":["Daniel Schug","Tyler J. Kovach","M. A. Wolfe","Jared Benson","Sanghyeok Park","J. P. Dodson","J. Corrigan","M. A. Eriksson","Justyna P. Zwolak"],"pdf_url":"https://arxiv.org/pdf/2402.13699v5.pdf","comment":"20 pages, 5 figures, abbreviated version published in Proceedings of\n the XAI4Sci: Explainable machine learning for sciences workshop at AAAI 2024,\n (Vancouver, Canada)"},{"id":"http://arxiv.org/abs/2501.07447v1","updated":"2025-01-13T16:18:31Z","published":"2025-01-13T16:18:31Z","title":"PrecipDiff: Leveraging image diffusion models to enhance satellite-based\n precipitation observations","summary":" A recent report from the World Meteorological Organization (WMO) highlights\nthat water-related disasters have caused the highest human losses among natural\ndisasters over the past 50 years, with over 91\\% of deaths occurring in\nlow-income countries. This disparity is largely due to the lack of adequate\nground monitoring stations, such as weather surveillance radars (WSR), which\nare expensive to install. For example, while the US and Europe combined possess\nover 600 WSRs, Africa, despite having almost one and half times their landmass,\nhas fewer than 40. To address this issue, satellite-based observations offer a\nglobal, near-real-time monitoring solution. However, they face several\nchallenges like accuracy, bias, and low spatial resolution. This study\nleverages the power of diffusion models and residual learning to address these\nlimitations in a unified framework. We introduce the first diffusion model for\ncorrecting the inconsistency between different precipitation products. Our\nmethod demonstrates the effectiveness in downscaling satellite precipitation\nestimates from 10 km to 1 km resolution. Extensive experiments conducted in the\nSeattle region demonstrate significant improvements in accuracy, bias\nreduction, and spatial detail. Importantly, our approach achieves these results\nusing only precipitation data, showcasing the potential of a purely computer\nvision-based approach for enhancing satellite precipitation products and paving\nthe way for further advancements in this domain.\n","authors":["Ting-Yu Dai","Hayato Ushijima-Mwesigwa"],"pdf_url":"https://arxiv.org/pdf/2501.07447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01246v3","updated":"2025-01-13T16:07:46Z","published":"2024-12-02T08:06:14Z","title":"Class Distance Weighted Cross Entropy Loss for Classification of Disease\n Severity","summary":" Assessing disease severity with ordinal classes, where each class reflects\nincreasing severity levels, benefits from loss functions designed for this\nordinal structure. Traditional categorical loss functions, like Cross-Entropy\n(CE), often perform suboptimally in these scenarios. 
To address this, we\npropose a novel loss function, Class Distance Weighted Cross-Entropy (CDW-CE),\nwhich penalizes misclassifications more severely when the predicted and actual\nclasses are farther apart. We evaluated CDW-CE using various deep\narchitectures, comparing its performance against several categorical and\nordinal loss functions. To assess the quality of latent representations, we\nused t-distributed stochastic neighbor embedding (t-SNE) and uniform manifold\napproximation and projection (UMAP) visualizations, quantified the clustering\nquality using the Silhouette Score, and compared Class Activation Maps (CAM)\ngenerated by models trained with CDW-CE and CE loss. Feedback from domain\nexperts was incorporated to evaluate how well model attention aligns with\nexpert opinion. Our results show that CDW-CE consistently improves performance\nin ordinal image classification tasks. It achieves higher Silhouette Scores,\nindicating better class discrimination capability, and its CAM visualizations\nshow a stronger focus on clinically significant regions, as validated by domain\nexperts. Receiver operator characteristics (ROC) curves and the area under the\ncurve (AUC) scores highlight that CDW-CE outperforms other loss functions,\nincluding prominent ordinal loss functions from the literature.\n","authors":["Gorkem Polat","Ümit Mert Çağlar","Alptekin Temizel"],"pdf_url":"https://arxiv.org/pdf/2412.01246v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07434v1","updated":"2025-01-13T16:02:33Z","published":"2025-01-13T16:02:33Z","title":"Guided SAM: Label-Efficient Part Segmentation","summary":" Localizing object parts precisely is essential for tasks such as object\nrecognition and robotic manipulation. Recent part segmentation methods require\nextensive training data and labor-intensive annotations. Segment-Anything Model\n(SAM) has demonstrated good performance on a wide range of segmentation\nproblems, but requires (manual) positional prompts to guide it where to\nsegment. Furthermore, since it has been trained on full objects instead of\nobject parts, it is prone to over-segmentation of parts. To address this, we\npropose a novel approach that guides SAM towards the relevant object parts. Our\nmethod learns positional prompts from coarse patch annotations that are easier\nand cheaper to acquire. We train classifiers on image patches to identify part\nclasses and aggregate patches into regions of interest (ROIs) with positional\nprompts. SAM is conditioned on these ROIs and prompts. This approach, termed\n`Guided SAM', enhances efficiency and reduces manual effort, allowing effective\npart segmentation with minimal labeled data. We demonstrate the efficacy of\nGuided SAM on a dataset of car parts, improving the average IoU on state of the\nart models from 0.37 to 0.49 with annotations that are on average five times\nmore efficient to acquire.\n","authors":["S. B. van Rooij","G. J. Burghouts"],"pdf_url":"https://arxiv.org/pdf/2501.07434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07430v1","updated":"2025-01-13T15:54:21Z","published":"2025-01-13T15:54:21Z","title":"Diff-Ensembler: Learning to Ensemble 2D Diffusion Models for\n Volume-to-Volume Medical Image Translation","summary":" Despite success in volume-to-volume translations in medical images, most\nexisting models struggle to effectively capture the inherent volumetric\ndistribution using 3D representations. 
The current state-of-the-art approach\ncombines multiple 2D-based networks through weighted averaging, thereby\nneglecting the 3D spatial structures. Directly training 3D models in medical\nimaging presents significant challenges due to high computational demands and\nthe need for large-scale datasets. To address these challenges, we introduce\nDiff-Ensembler, a novel hybrid 2D-3D model for efficient and effective\nvolumetric translations by ensembling perpendicularly trained 2D diffusion\nmodels with a 3D network in each diffusion step. Moreover, our model can\nnaturally be used to ensemble diffusion models conditioned on different\nmodalities, allowing flexible and accurate fusion of input conditions.\nExtensive experiments demonstrate that Diff-Ensembler attains superior accuracy\nand volumetric realism in 3D medical image super-resolution and modality\ntranslation. We further demonstrate the strength of our model's volumetric\nrealism using tumor segmentation as a downstream task.\n","authors":["Xiyue Zhu","Dou Hoon Kwark","Ruike Zhu","Kaiwen Hong","Yiqi Tao","Shirui Luo","Yudu Li","Zhi-Pei Liang","Volodymyr Kindratenko"],"pdf_url":"https://arxiv.org/pdf/2501.07430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00843v2","updated":"2025-01-13T15:48:06Z","published":"2025-01-01T13:51:03Z","title":"FusionSORT: Fusion Methods for Online Multi-object Visual Tracking","summary":" In this work, we investigate four different fusion methods for associating\ndetections to tracklets in multi-object visual tracking. In addition to\nconsidering strong cues such as motion and appearance information, we also\nconsider weak cues such as height intersection-over-union (height-IoU) and\ntracklet confidence information in the data association using different fusion\nmethods. These fusion methods include minimum, weighted sum based on IoU,\nKalman filter (KF) gating, and hadamard product of costs due to the different\ncues. We conduct extensive evaluations on validation sets of MOT17, MOT20 and\nDanceTrack datasets, and find out that the choice of a fusion method is key for\ndata association in multi-object visual tracking. We hope that this\ninvestigative work helps the computer vision research community to use the\nright fusion method for data association in multi-object visual tracking.\n","authors":["Nathanael L. Baisa"],"pdf_url":"https://arxiv.org/pdf/2501.00843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05226v2","updated":"2025-01-13T15:30:39Z","published":"2025-01-09T13:29:54Z","title":"Light Transport-aware Diffusion Posterior Sampling for Single-View\n Reconstruction of 3D Volumes","summary":" We introduce a single-view reconstruction technique of volumetric fields in\nwhich multiple light scattering effects are omnipresent, such as in clouds. We\nmodel the unknown distribution of volumetric fields using an unconditional\ndiffusion model trained on a novel benchmark dataset comprising 1,000\nsynthetically simulated volumetric density fields. The neural diffusion model\nis trained on the latent codes of a novel, diffusion-friendly, monoplanar\nrepresentation. The generative model is used to incorporate a tailored\nparametric diffusion posterior sampling technique into different reconstruction\ntasks. A physically-based differentiable volume renderer is employed to provide\ngradients with respect to light transport in the latent space. This stands in\ncontrast to classic NeRF approaches and makes the reconstructions better\naligned with observed data. 
Through various experiments, we demonstrate\nsingle-view reconstruction of volumetric clouds at a previously unattainable\nquality.\n","authors":["Ludwic Leonard","Nils Thuerey","Ruediger Westermann"],"pdf_url":"https://arxiv.org/pdf/2501.05226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08926v3","updated":"2025-01-13T15:19:14Z","published":"2024-10-11T15:50:53Z","title":"Zero-Shot Pupil Segmentation with SAM 2: A Case Study of Over 14 Million\n Images","summary":" We explore the transformative potential of SAM 2, a vision foundation model,\nin advancing gaze estimation and eye tracking technologies. By significantly\nreducing annotation time, lowering technical barriers through its ease of\ndeployment, and enhancing segmentation accuracy, SAM 2 addresses critical\nchallenges faced by researchers and practitioners. Utilizing its zero-shot\nsegmentation capabilities with minimal user input-a single click per video-we\ntested SAM 2 on over 14 million eye images from diverse datasets, including\nvirtual reality setups and the world's largest unified dataset recorded using\nwearable eye trackers. Remarkably, in pupil segmentation tasks, SAM 2 matches\nthe performance of domain-specific models trained solely on eye images,\nachieving competitive mean Intersection over Union (mIoU) scores of up to 93%\nwithout fine-tuning. Additionally, we provide our code and segmentation masks\nfor these widely used datasets to promote further research.\n","authors":["Virmarie Maquiling","Sean Anthony Byrne","Diederick C. Niehorster","Marco Carminati","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2410.08926v3.pdf","comment":"Virmarie Maquiling and Sean Anthony Byrne contributed equally to this\n paper, 8 pages, 3 figures, ETRA 2025, pre-print"},{"id":"http://arxiv.org/abs/2501.07397v1","updated":"2025-01-13T15:12:40Z","published":"2025-01-13T15:12:40Z","title":"OCORD: Open-Campus Object Removal Dataset","summary":" The rapid advancements in generative models, particularly diffusion-based\ntechniques, have revolutionized image inpainting tasks by enabling the\ngeneration of high-fidelity and diverse content. However, object removal\nremains under-explored as a specific subset of inpainting, facing challenges\nsuch as inadequate semantic understanding and the unintended generation of\nartifacts. Existing datasets for object removal often rely on synthetic data,\nwhich fails to align with real-world scenarios, limiting model performance.\nAlthough some real-world datasets address these issues partially, they suffer\nfrom scalability, annotation inefficiencies, and limited realism in physical\nphenomena such as lighting and shadows. To address these limitations, this\npaper introduces a novel approach to object removal by constructing a\nhigh-resolution real-world dataset through long-duration video capture with\nfixed camera settings. Leveraging advanced tools such as Grounding-DINO,\nSegment-Anything-Model, and MASA for automated annotation, we provide image,\nbackground, and mask pairs while significantly reducing annotation time and\nlabor. 
With our efficient annotation pipeline, we release the first fully open,\nhigh-resolution real-world dataset for object removal, and demonstrate improved performance\nin object removal tasks through fine-tuning of pre-trained diffusion models.\n","authors":["Shuo Zhang","Runpu Wei","Kongming Liang"],"pdf_url":"https://arxiv.org/pdf/2501.07397v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2501.07396v1","updated":"2025-01-13T15:11:27Z","published":"2025-01-13T15:11:27Z","title":"Zero-Shot Scene Understanding for Automatic Target Recognition Using\n Large Vision-Language Models","summary":" Automatic target recognition (ATR) plays a critical role in tasks such as\nnavigation and surveillance, where safety and accuracy are paramount. In\nextreme use cases, such as military applications, these factors are often\nchallenged due to the presence of unknown terrains, environmental conditions,\nand novel object categories. Current object detectors, including open-world\ndetectors, lack the ability to confidently recognize novel objects or operate\nin unknown environments, as they have not been exposed to these new conditions.\nHowever, Large Vision-Language Models (LVLMs) exhibit emergent properties that\nenable them to recognize objects in varying conditions in a zero-shot manner.\nDespite this, LVLMs struggle to localize objects effectively within a scene. To\naddress these limitations, we propose a novel pipeline that combines the\ndetection capabilities of open-world detectors with the recognition confidence\nof LVLMs, creating a robust system for zero-shot ATR of novel classes and\nunknown domains. In this study, we compare the performance of various LVLMs for\nrecognizing military vehicles, which are often underrepresented in training\ndatasets. Additionally, we examine the impact of factors such as distance\nrange, modality, and prompting methods on the recognition performance,\nproviding insights into the development of more reliable ATR systems for novel\nconditions and classes.\n","authors":["Yasiru Ranasinghe","Vibashan VS","James Uplinger","Celso De Melo","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2501.07396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07390v1","updated":"2025-01-13T15:06:51Z","published":"2025-01-13T15:06:51Z","title":"Kolmogorov-Arnold Network for Remote Sensing Image Semantic Segmentation","summary":" Semantic segmentation plays a crucial role in remote sensing applications,\nwhere the accurate extraction and representation of features are essential for\nhigh-quality results. Despite the widespread use of encoder-decoder\narchitectures, existing methods often struggle with fully utilizing the\nhigh-dimensional features extracted by the encoder and efficiently recovering\ndetailed information during decoding. To address these problems, we propose a\nnovel semantic segmentation network, namely DeepKANSeg, including two key\ninnovations based on the emerging Kolmogorov Arnold Network (KAN). Notably, the\nadvantage of KAN lies in its ability to decompose high-dimensional complex\nfunctions into univariate transformations, enabling efficient and flexible\nrepresentation of intricate relationships in data. First, we introduce a\nKAN-based deep feature refinement module, namely DeepKAN to effectively capture\ncomplex spatial and rich semantic relationships from high-dimensional features.\nSecond, we replace the traditional multi-layer perceptron (MLP) layers in the\nglobal-local combined decoder with KAN-based linear layers, namely GLKAN. 
This\nmodule enhances the decoder's ability to capture fine-grained details during\ndecoding. To evaluate the effectiveness of the proposed method, experiments are\nconducted on two well-known fine-resolution remote sensing benchmark datasets,\nnamely ISPRS Vaihingen and ISPRS Potsdam. The results demonstrate that the\nKAN-enhanced segmentation model achieves superior performance in terms of\naccuracy compared to state-of-the-art methods. They highlight the potential of\nKANs as a powerful alternative to traditional architectures in semantic\nsegmentation tasks. Moreover, the explicit univariate decomposition provides\nimproved interpretability, which is particularly beneficial for applications\nrequiring explainable learning in remote sensing.\n","authors":["Xianping Ma","Ziyao Wang","Yin Hu","Xiaokang Zhang","Man-On Pun"],"pdf_url":"https://arxiv.org/pdf/2501.07390v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.07378v1","updated":"2025-01-13T14:54:49Z","published":"2025-01-13T14:54:49Z","title":"FedSemiDG: Domain Generalized Federated Semi-supervised Medical Image\n Segmentation","summary":" Medical image segmentation is challenging due to the diversity of medical\nimages and the lack of labeled data, which motivates recent developments in\nfederated semi-supervised learning (FSSL) to leverage a large amount of\nunlabeled data from multiple centers for model training without sharing raw\ndata. However, what remains under-explored in FSSL is the domain shift problem\nwhich may cause suboptimal model aggregation and low effectivity of the\nutilization of unlabeled data, eventually leading to unsatisfactory performance\nin unseen domains. In this paper, we explore this previously ignored scenario,\nnamely domain generalized federated semi-supervised learning (FedSemiDG), which\naims to learn a model in a distributed manner from multiple domains with\nlimited labeled data and abundant unlabeled data such that the model can\ngeneralize well to unseen domains. We present a novel framework, Federated\nGeneralization-Aware SemiSupervised Learning (FGASL), to address the challenges\nin FedSemiDG by effectively tackling critical issues at both global and local\nlevels. Globally, we introduce Generalization-Aware Aggregation (GAA),\nassigning adaptive weights to local models based on their generalization\nperformance. Locally, we use a Dual-Teacher Adaptive Pseudo Label Refinement\n(DR) strategy to combine global and domain-specific knowledge, generating more\nreliable pseudo labels. Additionally, Perturbation-Invariant Alignment (PIA)\nenforces feature consistency under perturbations, promoting domain-invariant\nlearning. 
Extensive experiments on three medical segmentation tasks (cardiac\nMRI, spine MRI and bladder cancer MRI) demonstrate that our method\nsignificantly outperforms state-of-the-art FSSL and domain generalization\napproaches, achieving robust generalization on unseen domains.\n","authors":["Zhipeng Deng","Zhe Xu","Tsuyoshi Isshiki","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.07378v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2412.05271v4","updated":"2025-01-13T14:42:20Z","published":"2024-12-06T18:57:08Z","title":"Expanding Performance Boundaries of Open-Source Multimodal Models with\n Model, Data, and Test-Time Scaling","summary":" We introduce InternVL 2.5, an advanced multimodal large language model (MLLM)\nseries that builds upon InternVL 2.0, maintaining its core model architecture\nwhile introducing significant enhancements in training and testing strategies\nas well as data quality. In this work, we delve into the relationship between\nmodel scaling and performance, systematically exploring the performance trends\nin vision encoders, language models, dataset sizes, and test-time\nconfigurations. Through extensive evaluations on a wide range of benchmarks,\nincluding multi-discipline reasoning, document understanding, multi-image /\nvideo understanding, real-world comprehension, multimodal hallucination\ndetection, visual grounding, multilingual capabilities, and pure language\nprocessing, InternVL 2.5 exhibits competitive performance, rivaling leading\ncommercial models such as GPT-4o and Claude-3.5-Sonnet. Notably, our model is\nthe first open-source MLLMs to surpass 70% on the MMMU benchmark, achieving a\n3.7-point improvement through Chain-of-Thought (CoT) reasoning and showcasing\nstrong potential for test-time scaling. We hope this model contributes to the\nopen-source community by setting new standards for developing and applying\nmultimodal AI systems. HuggingFace demo see\nhttps://huggingface.co/spaces/OpenGVLab/InternVL\n","authors":["Zhe Chen","Weiyun Wang","Yue Cao","Yangzhou Liu","Zhangwei Gao","Erfei Cui","Jinguo Zhu","Shenglong Ye","Hao Tian","Zhaoyang Liu","Lixin Gu","Xuehui Wang","Qingyun Li","Yimin Ren","Zixuan Chen","Jiapeng Luo","Jiahao Wang","Tan Jiang","Bo Wang","Conghui He","Botian Shi","Xingcheng Zhang","Han Lv","Yi Wang","Wenqi Shao","Pei Chu","Zhongying Tu","Tong He","Zhiyong Wu","Huipeng Deng","Jiaye Ge","Kai Chen","Kaipeng Zhang","Limin Wang","Min Dou","Lewei Lu","Xizhou Zhu","Tong Lu","Dahua Lin","Yu Qiao","Jifeng Dai","Wenhai Wang"],"pdf_url":"https://arxiv.org/pdf/2412.05271v4.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2412.09718v2","updated":"2025-01-13T14:37:52Z","published":"2024-12-12T20:48:06Z","title":"BayesAdapter: enhanced uncertainty estimation in CLIP few-shot\n adaptation","summary":" The emergence of large pre-trained vision-language models (VLMs) represents a\nparadigm shift in machine learning, with unprecedented results in a broad span\nof visual recognition tasks. CLIP, one of the most popular VLMs, has exhibited\nremarkable zero-shot and transfer learning capabilities in classification. To\ntransfer CLIP to downstream tasks, adapters constitute a parameter-efficient\napproach that avoids backpropagation through the large model (unlike related\nprompt learning methods). However, CLIP adapters have been developed to target\ndiscriminative performance, and the quality of their uncertainty estimates has\nbeen overlooked. 
In this work we show that the discriminative performance of\nstate-of-the-art CLIP adapters does not always correlate with their uncertainty\nestimation capabilities, which are essential for a safe deployment in\nreal-world scenarios. We also demonstrate that one of such adapters is obtained\nthrough MAP inference from a more general probabilistic framework. Based on\nthis observation we introduce BayesAdapter, which leverages Bayesian inference\nto estimate a full probability distribution instead of a single point, better\ncapturing the variability inherent in the parameter space. In a comprehensive\nempirical evaluation we show that our approach obtains high quality uncertainty\nestimates in the predictions, standing out in calibration and selective\nclassification. Our code will be publicly available upon acceptance of the\npaper.\n","authors":["Pablo Morales-Álvarez","Stergios Christodoulidis","Maria Vakalopoulou","Pablo Piantanida","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2412.09718v2.pdf","comment":"30 pages, 5 figures, 23 tables"},{"id":"http://arxiv.org/abs/2406.16531v2","updated":"2025-01-13T14:34:40Z","published":"2024-06-24T11:10:41Z","title":"GIM: A Million-scale Benchmark for Generative Image Manipulation\n Detection and Localization","summary":" The extraordinary ability of generative models emerges as a new trend in\nimage editing and generating realistic images, posing a serious threat to the\ntrustworthiness of multimedia data and driving the research of image\nmanipulation detection and location (IMDL). However, the lack of a large-scale\ndata foundation makes the IMDL task unattainable. In this paper, we build a\nlocal manipulation data generation pipeline that integrates the powerful\ncapabilities of SAM, LLM, and generative models. Upon this basis, we propose\nthe GIM dataset, which has the following advantages: 1) Large scale, GIM\nincludes over one million pairs of AI-manipulated images and real images. 2)\nRich image content, GIM encompasses a broad range of image classes. 3) Diverse\ngenerative manipulation, the images are manipulated images with\nstate-of-the-art generators and various manipulation tasks. The aforementioned\nadvantages allow for a more comprehensive evaluation of IMDL methods, extending\ntheir applicability to diverse images. We introduce the GIM benchmark with two\nsettings to evaluate existing IMDL methods. In addition, we propose a novel\nIMDL framework, termed GIMFormer, which consists of a ShadowTracer,\nFrequency-Spatial block (FSB), and a Multi-Window Anomalous Modeling (MWAM)\nmodule. Extensive experiments on the GIM demonstrate that GIMFormer surpasses\nthe previous state-of-the-art approach on two different benchmarks.\n","authors":["Yirui Chen","Xudong Huang","Quan Zhang","Wei Li","Mingjian Zhu","Qiangyu Yan","Simiao Li","Hanting Chen","Hailin Hu","Jie Yang","Wei Liu","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2406.16531v2.pdf","comment":"Code page: https://github.com/chenyirui/GIM"},{"id":"http://arxiv.org/abs/2404.16432v5","updated":"2025-01-13T14:34:18Z","published":"2024-04-25T09:07:19Z","title":"Point-JEPA: A Joint Embedding Predictive Architecture for\n Self-Supervised Learning on Point Cloud","summary":" Recent advancements in self-supervised learning in the point cloud domain\nhave demonstrated significant potential. 
However, these methods often suffer\nfrom drawbacks, including lengthy pre-training time, the necessity of\nreconstruction in the input space, or the necessity of additional modalities.\nIn order to address these issues, we introduce Point-JEPA, a joint embedding\npredictive architecture designed specifically for point cloud data. To this\nend, we introduce a sequencer that orders point cloud patch embeddings to\nefficiently compute and utilize their proximity based on the indices during\ntarget and context selection. The sequencer also allows shared computations of\nthe patch embeddings' proximity between context and target selection, further\nimproving the efficiency. Experimentally, our method achieves competitive\nresults with state-of-the-art methods while avoiding the reconstruction in the\ninput space or additional modality.\n","authors":["Ayumu Saito","Prachi Kudeshia","Jiju Poovvancheri"],"pdf_url":"https://arxiv.org/pdf/2404.16432v5.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.07360v1","updated":"2025-01-13T14:30:01Z","published":"2025-01-13T14:30:01Z","title":"TimberVision: A Multi-Task Dataset and Framework for Log-Component\n Segmentation and Tracking in Autonomous Forestry Operations","summary":" Timber represents an increasingly valuable and versatile resource. However,\nforestry operations such as harvesting, handling and measuring logs still\nrequire substantial human labor in remote environments posing significant\nsafety risks. Progressively automating these tasks has the potential of\nincreasing their efficiency as well as safety, but requires an accurate\ndetection of individual logs as well as live trees and their context. Although\ninitial approaches have been proposed for this challenging application domain,\nspecialized data and algorithms are still too scarce to develop robust\nsolutions. To mitigate this gap, we introduce the TimberVision dataset,\nconsisting of more than 2k annotated RGB images containing a total of 51k trunk\ncomponents including cut and lateral surfaces, thereby surpassing any existing\ndataset in this domain in terms of both quantity and detail by a large margin.\nBased on this data, we conduct a series of ablation experiments for oriented\nobject detection and instance segmentation and evaluate the influence of\nmultiple scene parameters on model performance. We introduce a generic\nframework to fuse the components detected by our models for both tasks into\nunified trunk representations. Furthermore, we automatically derive geometric\nproperties and apply multi-object tracking to further enhance robustness. Our\ndetection and tracking approach provides highly descriptive and accurate trunk\nrepresentations solely from RGB image data, even under challenging\nenvironmental conditions. Our solution is suitable for a wide range of\napplication scenarios and can be readily combined with other sensor modalities.\n","authors":["Daniel Steininger","Julia Simon","Andreas Trondl","Markus Murschitz"],"pdf_url":"https://arxiv.org/pdf/2501.07360v1.pdf","comment":"Accepted at Winter Conference on Applications of Computer Vision\n (WACV) 2025. 
Code and dataset available at\n https://github.com/timbervision/timbervision"},{"id":"http://arxiv.org/abs/2501.03836v2","updated":"2025-01-13T14:10:16Z","published":"2025-01-07T14:45:39Z","title":"SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor\n Diagnosis","summary":" Brain tumors can result in neurological dysfunction, alterations in cognitive\nand psychological states, increased intracranial pressure, and the occurrence\nof seizures, thereby presenting a substantial risk to human life and health.\nThe You Only Look Once (YOLO) series models have demonstrated superior accuracy\nin object detection for medical imaging. In this paper, we develop a novel\nSCC-YOLO architecture by integrating the SCConv attention mechanism into\nYOLOv9. The SCConv module reconstructs an efficient convolutional module by\nreducing spatial and channel redundancy among features, thereby enhancing the\nlearning of image features. We investigate the impact of integrating different\nattention mechanisms with the YOLOv9 model on brain tumor image detection using\nboth the Br35H dataset and our self-made dataset (Brain_Tumor_Dataset).\nExperimental results show that on the Br35H dataset, SCC-YOLO achieved a 0.3%\nimprovement in mAP50 compared to YOLOv9, while on our self-made dataset,\nSCC-YOLO exhibited a 0.5% improvement over YOLOv9. SCC-YOLO has reached\nstate-of-the-art performance in brain tumor detection. Source code is available\nat: https://jihulab.com/healthcare-information-studio/SCC-YOLO/-/tree/master\n","authors":["Runci Bai"],"pdf_url":"https://arxiv.org/pdf/2501.03836v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15770v2","updated":"2025-01-13T14:00:18Z","published":"2024-11-24T09:48:03Z","title":"Text-Guided Coarse-to-Fine Fusion Network for Robust Remote Sensing\n Visual Question Answering","summary":" Remote Sensing Visual Question Answering (RSVQA) has gained significant\nresearch interest. However, current RSVQA methods are limited by the imaging\nmechanisms of optical sensors, particularly under challenging conditions such\nas cloud-covered and low-light scenarios. Given the all-time and all-weather\nimaging capabilities of Synthetic Aperture Radar (SAR), it is crucial to\ninvestigate the integration of optical-SAR images to improve RSVQA performance.\nIn this work, we propose a Text-guided Coarse-to-Fine Fusion Network (TGFNet),\nwhich leverages the semantic relationships between question text and\nmulti-source images to guide the network toward complementary fusion at the\nfeature level. Specifically, we develop a Text-guided Coarse-to-Fine Attention\nRefinement (CFAR) module to focus on key areas related to the question in\ncomplex remote sensing images. This module progressively directs attention from\nbroad areas to finer details through key region routing, enhancing the model's\nability to focus on relevant regions. Furthermore, we propose an Adaptive\nMulti-Expert Fusion (AMEF) module that dynamically integrates different\nexperts, enabling the adaptive fusion of optical and SAR features. In addition,\nwe create the first large-scale benchmark dataset for evaluating optical-SAR\nRSVQA methods, comprising 6,008 well-aligned optical-SAR image pairs and\n1,036,694 well-labeled question-answer pairs across 16 diverse question types,\nincluding complex relational reasoning questions. 
Extensive experiments on the\nproposed dataset demonstrate that our TGFNet effectively integrates\ncomplementary information between optical and SAR images, significantly\nimproving the model's performance in challenging scenarios. The dataset is\navailable at: https://github.com/mmic-lcl/.\n Index Terms: Remote Sensing Visual Question Answering, Multi-source Data\nFusion, Multimodal, Remote Sensing, OPT-SAR.\n","authors":["Zhicheng Zhao","Changfu Zhou","Yu Zhang","Chenglong Li","Xiaoliang Ma","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2411.15770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07342v1","updated":"2025-01-13T13:56:31Z","published":"2025-01-13T13:56:31Z","title":"A method for estimating roadway billboard salience","summary":" Roadside billboards and other forms of outdoor advertising play a crucial\nrole in marketing initiatives; however, they can also distract drivers,\npotentially contributing to accidents. This study delves into the significance\nof roadside advertising in images captured from a driver's perspective.\nFirstly, it evaluates the effectiveness of neural networks in detecting\nadvertising along roads, focusing on the YOLOv5 and Faster R-CNN models.\nSecondly, the study addresses the determination of billboard significance using\nmethods for saliency extraction. The UniSal and SpectralResidual methods were\nemployed to create saliency maps for each image. The study establishes a\ndatabase of eye tracking sessions captured during city highway driving to\nassess the saliency models.\n","authors":["Zuzana Berger Haladova","Michal Zrubec","Zuzana Cernekova"],"pdf_url":"https://arxiv.org/pdf/2501.07342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05450v2","updated":"2025-01-13T13:54:31Z","published":"2024-10-07T19:34:25Z","title":"AI-Driven Early Mental Health Screening: Analyzing Selfies of Pregnant\n Women","summary":" Major Depressive Disorder and anxiety disorders affect millions globally,\ncontributing significantly to the burden of mental health issues. Early\nscreening is crucial for effective intervention, as timely identification of\nmental health issues can significantly improve treatment outcomes. Artificial\nintelligence (AI) can be valuable for improving the screening of mental\ndisorders, enabling early intervention and better treatment outcomes. AI-driven\nscreening can leverage the analysis of multiple data sources, including facial\nfeatures in digital images. However, existing methods often rely on controlled\nenvironments or specialized equipment, limiting their broad applicability. This\nstudy explores the potential of AI models for ubiquitous depression-anxiety\nscreening given face-centric selfies. The investigation focuses on high-risk\npregnant patients, a population that is particularly vulnerable to mental\nhealth issues. To cope with limited training data resulting from our clinical\nsetup, pre-trained models were utilized in two different approaches:\nfine-tuning convolutional neural networks (CNNs) originally designed for facial\nexpression recognition and employing vision-language models (VLMs) for\nzero-shot analysis of facial expressions. Experimental results indicate that\nthe proposed VLM-based method significantly outperforms CNNs, achieving an\naccuracy of 77.6%. Although there is significant room for improvement, the\nresults suggest that VLMs can be a promising approach for mental health\nscreening.\n","authors":["Gustavo A. Basílio","Thiago B. Pereira","Alessandro L. 
Koerich","Hermano Tavares","Ludmila Dias","Maria das Graças da S. Teixeira","Rafael T. Sousa","Wilian H. Hisatugu","Amanda S. Mota","Anilton S. Garcia","Marco Aurélio K. Galletta","Thiago M. Paixão"],"pdf_url":"https://arxiv.org/pdf/2410.05450v2.pdf","comment":"This article has been accepted for publication in HEALTHINF25 at the\n 18th International Joint Conference on Biomedical Engineering Systems and\n Technologies (BIOSTEC 2025)"},{"id":"http://arxiv.org/abs/2501.07334v1","updated":"2025-01-13T13:47:00Z","published":"2025-01-13T13:47:00Z","title":"Anonymization of Documents for Law Enforcement with Machine Learning","summary":" The steadily increasing utilization of data-driven methods and approaches in\nareas that handle sensitive personal information such as in law enforcement\nmandates an ever increasing effort in these institutions to comply with data\nprotection guidelines. In this work, we present a system for automatically\nanonymizing images of scanned documents, reducing manual effort while ensuring\ndata protection compliance. Our method considers the viability of further\nforensic processing after anonymization by minimizing automatically redacted\nareas by combining automatic detection of sensitive regions with knowledge from\na manually anonymized reference document. Using a self-supervised image model\nfor instance retrieval of the reference document, our approach requires only\none anonymized example to efficiently redact all documents of the same type,\nsignificantly reducing processing time. We show that our approach outperforms\nboth a purely automatic redaction system and also a naive copy-paste scheme of\nthe reference anonymization to other documents on a hand-crafted dataset of\nground truth redactions.\n","authors":["Manuel Eberhardinger","Patrick Takenaka","Daniel Grießhaber","Johannes Maucher"],"pdf_url":"https://arxiv.org/pdf/2501.07334v1.pdf","comment":"Accepted at IEEE Symposium on CI in Security, Defence and Biometrics\n 2025 (IEEE CISDB)"},{"id":"http://arxiv.org/abs/2403.15517v2","updated":"2025-01-13T13:32:48Z","published":"2024-03-22T11:14:30Z","title":"Improving Forward Compatibility in Class Incremental Learning by\n Increasing Representation Rank and Feature Richness","summary":" Class Incremental Learning (CIL) constitutes a pivotal subfield within\ncontinual learning, aimed at enabling models to progressively learn new\nclassification tasks while retaining knowledge obtained from prior tasks.\nAlthough previous studies have predominantly focused on backward compatible\napproaches to mitigate catastrophic forgetting, recent investigations have\nintroduced forward compatible methods to enhance performance on novel tasks and\ncomplement existing backward compatible methods. In this study, we introduce an\neffective-Rank based Feature Richness enhancement (RFR) method, designed for\nimproving forward compatibility. Specifically, this method increases the\neffective rank of representations during the base session, thereby facilitating\nthe incorporation of more informative features pertinent to unseen novel tasks.\nConsequently, RFR achieves dual objectives in backward and forward\ncompatibility: minimizing feature extractor modifications and enhancing novel\ntask performance, respectively. To validate the efficacy of our approach, we\nestablish a theoretical connection between effective rank and the Shannon\nentropy of representations. Subsequently, we conduct comprehensive experiments\nby integrating RFR into eleven well-known CIL methods. 
Our results demonstrate\nthe effectiveness of our approach in enhancing novel-task performance while\nmitigating catastrophic forgetting. Furthermore, our method notably improves\nthe average incremental accuracy across all eleven cases examined.\n","authors":["Jaeill Kim","Wonseok Lee","Moonjung Eo","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2403.15517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07312v1","updated":"2025-01-13T13:24:41Z","published":"2025-01-13T13:24:41Z","title":"Localization-Aware Multi-Scale Representation Learning for Repetitive\n Action Counting","summary":" Repetitive action counting (RAC) aims to estimate the number of\nclass-agnostic action occurrences in a video without exemplars. Most current\nRAC methods rely on a raw frame-to-frame similarity representation for period\nprediction. However, this approach can be significantly disrupted by common\nnoise such as action interruptions and inconsistencies, leading to sub-optimal\ncounting performance in realistic scenarios. In this paper, we introduce a\nforeground localization optimization objective into similarity representation\nlearning to obtain more robust and efficient video features. We propose a\nLocalization-Aware Multi-Scale Representation Learning (LMRL) framework.\nSpecifically, we apply a Multi-Scale Period-Aware Representation (MPR) with a\nscale-specific design to accommodate various action frequencies and learn more\nflexible temporal correlations. Furthermore, we introduce the Repetition\nForeground Localization (RFL) method, which enhances the representation by\ncoarsely identifying periodic actions and incorporating global semantic\ninformation. These two modules can be jointly optimized, resulting in a more\ndiscerning periodic action representation. Our approach significantly reduces\nthe impact of noise, thereby improving counting accuracy. Additionally, the\nframework is designed to be scalable and adaptable to different types of video\ncontent. Experimental results on the RepCountA and UCFRep datasets demonstrate\nthat our proposed method effectively handles repetitive action counting.\n","authors":["Sujia Wang","Xiangwei Shen","Yansong Tang","Xin Dong","Wenjia Geng","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07312v1.pdf","comment":"Accepted by IEEE VCIP2024"},{"id":"http://arxiv.org/abs/2501.07305v1","updated":"2025-01-13T13:13:06Z","published":"2025-01-13T13:13:06Z","title":"The Devil is in the Spurious Correlation: Boosting Moment Retrieval via\n Temporal Dynamic Learning","summary":" Given a textual query along with a corresponding video, the objective of\nmoment retrieval aims to localize the moments relevant to the query within the\nvideo. While commendable results have been demonstrated by existing\ntransformer-based approaches, predicting the accurate temporal span of the\ntarget moment is currently still a major challenge. In this paper, we reveal\nthat a crucial reason stems from the spurious correlation between the text\nqueries and the moment context. Namely, the model may associate the textual\nquery with the background frames rather than the target moment. To address this\nissue, we propose a temporal dynamic learning approach for moment retrieval,\nwhere two strategies are designed to mitigate the spurious correlation. First,\nwe introduce a novel video synthesis approach to construct a dynamic context\nfor the relevant moment. 
With separate yet similar videos mixed up, the\nsynthesis approach empowers our model to attend to the target moment of the\ncorresponding query under various dynamic contexts. Second, we enhance the\nrepresentation by learning temporal dynamics. Besides the visual\nrepresentation, text queries are aligned with temporal dynamic representations,\nwhich enables our model to establish a non-spurious correlation between the\nquery-related moment and context. With the aforementioned proposed method, the\nspurious correlation issue in moment retrieval can be largely alleviated. Our\nmethod establishes a new state-of-the-art performance on two popular benchmarks\nof moment retrieval, \\ie, QVHighlights and Charades-STA. In addition, the\ndetailed ablation analyses demonstrate the effectiveness of the proposed\nstrategies. Our code will be publicly available.\n","authors":["Xinyang Zhou","Fanyue Wei","Lixin Duan","Wen Li"],"pdf_url":"https://arxiv.org/pdf/2501.07305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07304v1","updated":"2025-01-13T13:12:18Z","published":"2025-01-13T13:12:18Z","title":"Code and Pixels: Multi-Modal Contrastive Pre-training for Enhanced\n Tabular Data Analysis","summary":" Learning from tabular data is of paramount importance, as it complements the\nconventional analysis of image and video data by providing a rich source of\nstructured information that is often critical for comprehensive understanding\nand decision-making processes. We present Multi-task Contrastive Masked Tabular\nModeling (MT-CMTM), a novel method aiming to enhance tabular models by\nleveraging the correlation between tabular data and corresponding images.\nMT-CMTM employs a dual strategy combining contrastive learning with masked\ntabular modeling, optimizing the synergy between these data modalities.\n Central to our approach is a 1D Convolutional Neural Network with residual\nconnections and an attention mechanism (1D-ResNet-CBAM), designed to\nefficiently process tabular data without relying on images. This enables\nMT-CMTM to handle purely tabular data for downstream tasks, eliminating the\nneed for potentially costly image acquisition and processing.\n We evaluated MT-CMTM on the DVM car dataset, which is uniquely suited for\nthis particular scenario, and the newly developed HIPMP dataset, which connects\nmembrane fabrication parameters with image data. Our MT-CMTM model outperforms\nthe proposed tabular 1D-ResNet-CBAM, which is trained from scratch, achieving a\nrelative 1.48% improvement in relative MSE on HIPMP and a 2.38% increase in\nabsolute accuracy on DVM. These results demonstrate MT-CMTM's robustness and\nits potential to advance the field of multi-modal learning.\n","authors":["Kankana Roy","Lars Krämer","Sebastian Domaschke","Malik Haris","Roland Aydin","Fabian Isensee","Martin Held"],"pdf_url":"https://arxiv.org/pdf/2501.07304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20287v5","updated":"2025-01-13T13:12:17Z","published":"2024-03-29T16:58:13Z","title":"Benchmarking Counterfactual Image Generation","summary":" Generative AI has revolutionised visual content editing, empowering users to\neffortlessly modify images and videos. However, not all edits are equal. To\nperform realistic edits in domains such as natural image or medical imaging,\nmodifications must respect causal relationships inherent to the data generation\nprocess. Such image editing falls into the counterfactual image generation\nregime. 
Evaluating counterfactual image generation is substantially complex:\nit not only lacks observable ground truths, but also requires adherence to\ncausal constraints. Although several counterfactual image generation methods\nand evaluation metrics exist, a comprehensive comparison within a unified\nsetting is lacking. We present a comparison framework to thoroughly benchmark\ncounterfactual image generation methods. We integrate all models that have been\nused for the task at hand and expand them to novel datasets and causal graphs,\ndemonstrating the superiority of Hierarchical VAEs across most datasets and\nmetrics. Our framework is implemented in a user-friendly Python package that\ncan be extended to incorporate additional SCMs, causal methods, generative\nmodels, and datasets for the community to build on. Code:\nhttps://github.com/gulnazaki/counterfactual-benchmark.\n","authors":["Thomas Melistas","Nikos Spyrou","Nefeli Gkouti","Pedro Sanchez","Athanasios Vlontzos","Yannis Panagakis","Giorgos Papanastasiou","Sotirios A. Tsaftaris"],"pdf_url":"https://arxiv.org/pdf/2403.20287v5.pdf","comment":"Published as a conference paper at NeurIPS 2024 Datasets and\n Benchmarks Track https://openreview.net/forum?id=0T8xRFrScB Project page:\n https://gulnazaki.github.io/counterfactual-benchmark"},{"id":"http://arxiv.org/abs/2501.07300v1","updated":"2025-01-13T13:07:51Z","published":"2025-01-13T13:07:51Z","title":"Comparative analysis of optical character recognition methods for Sámi\n texts from the National Library of Norway","summary":" Optical Character Recognition (OCR) is crucial to the National Library of\nNorway's (NLN) digitisation process as it converts scanned documents into\nmachine-readable text. However, for the S\\'ami documents in NLN's collection,\nthe OCR accuracy is insufficient. Given that OCR quality affects downstream\nprocesses, evaluating and improving OCR for text written in S\\'ami languages is\nnecessary to make these resources accessible. To address this need, this work\nfine-tunes and evaluates three established OCR approaches, Transkribus,\nTesseract and TrOCR, for transcribing S\\'ami texts from NLN's collection. Our\nresults show that Transkribus and TrOCR outperform Tesseract on this task,\nwhile Tesseract achieves superior performance on an out-of-domain dataset.\nFurthermore, we show that fine-tuning pre-trained models and supplementing\nmanual annotations with machine annotations and synthetic text images can yield\naccurate OCR for S\\'ami languages, even with a moderate amount of manually\nannotated data.\n","authors":["Tita Enstad","Trond Trosterud","Marie Iversdatter Røsok","Yngvil Beyer","Marie Roald"],"pdf_url":"https://arxiv.org/pdf/2501.07300v1.pdf","comment":"To be published in Proceedings of the 25th Nordic Conference on\n Computational Linguistics (NoDaLiDa)"},{"id":"http://arxiv.org/abs/2410.22829v2","updated":"2025-01-13T13:07:25Z","published":"2024-10-30T09:11:25Z","title":"Situational Scene Graph for Structured Human-centric Situation\n Understanding","summary":" Graph-based representation has been widely used in modelling spatio-temporal\nrelationships in video understanding. Although effective, existing graph-based\napproaches focus on capturing the human-object relationships while ignoring\nfine-grained semantic properties of the action components. These semantic\nproperties are crucial for understanding the current situation, such as where\nthe action takes place, what tools are used and functional properties of\nthe objects. 
In this work, we propose a graph-based representation called\nSituational Scene Graph (SSG) to encode both human-object relationships and the\ncorresponding semantic properties. The semantic details are represented as\npredefined roles and values inspired by situation frame, which is originally\ndesigned to represent a single action. Based on our proposed representation, we\nintroduce the task of situational scene graph generation and propose a\nmulti-stage pipeline Interactive and Complementary Network (InComNet) to\naddress the task. Given that the existing datasets are not applicable to the\ntask, we further introduce a SSG dataset whose annotations consist of semantic\nrole-value frames for human, objects and verb predicates of human-object\nrelations. Finally, we demonstrate the effectiveness of our proposed SSG\nrepresentation by testing on different downstream tasks. Experimental results\nshow that the unified representation can not only benefit predicate\nclassification and semantic role-value classification, but also benefit\nreasoning tasks on human-centric situation understanding. We will release the\ncode and the dataset soon.\n","authors":["Chinthani Sugandhika","Chen Li","Deepu Rajan","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2410.22829v2.pdf","comment":"Accepted for WACV 2025"},{"id":"http://arxiv.org/abs/2501.07297v1","updated":"2025-01-13T13:04:00Z","published":"2025-01-13T13:04:00Z","title":"Toward Realistic Camouflaged Object Detection: Benchmarks and Method","summary":" Camouflaged object detection (COD) primarily relies on semantic or instance\nsegmentation methods. While these methods have made significant advancements in\nidentifying the contours of camouflaged objects, they may be inefficient or\ncost-effective for tasks that only require the specific location of the object.\nObject detection algorithms offer an optimized solution for Realistic\nCamouflaged Object Detection (RCOD) in such cases. However, detecting\ncamouflaged objects remains a formidable challenge due to the high degree of\nsimilarity between the features of the objects and their backgrounds. Unlike\nsegmentation methods that perform pixel-wise comparisons to differentiate\nbetween foreground and background, object detectors omit this analysis, further\naggravating the challenge. To solve this problem, we propose a camouflage-aware\nfeature refinement (CAFR) strategy. Since camouflaged objects are not rare\ncategories, CAFR fully utilizes a clear perception of the current object within\nthe prior knowledge of large models to assist detectors in deeply understanding\nthe distinctions between background and foreground. Specifically, in CAFR, we\nintroduce the Adaptive Gradient Propagation (AGP) module that fine-tunes all\nfeature extractor layers in large detection models to fully refine\nclass-specific features from camouflaged contexts. We then design the Sparse\nFeature Refinement (SFR) module that optimizes the transformer-based feature\nextractor to focus primarily on capturing class-specific features in\ncamouflaged scenarios. To facilitate the assessment of RCOD tasks, we manually\nannotate the labels required for detection on three existing segmentation COD\ndatasets, creating a new benchmark for RCOD tasks. 
Code and datasets are\navailable at: https://github.com/zhimengXin/RCOD.\n","authors":["Zhimeng Xin","Tianxu Wu","Shiming Chen","Shuo Ye","Zijing Xie","Yixiong Zou","Xinge You","Yufei Guo"],"pdf_url":"https://arxiv.org/pdf/2501.07297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07296v1","updated":"2025-01-13T13:03:28Z","published":"2025-01-13T13:03:28Z","title":"Event-based Video Person Re-identification via Cross-Modality and\n Temporal Collaboration","summary":" Video-based person re-identification (ReID) has become increasingly important\ndue to its applications in video surveillance applications. By employing events\nin video-based person ReID, more motion information can be provided between\ncontinuous frames to improve recognition accuracy. Previous approaches have\nassisted by introducing event data into the video person ReID task, but they\nstill cannot avoid the privacy leakage problem caused by RGB images. In order\nto avoid privacy attacks and to take advantage of the benefits of event data,\nwe consider using only event data. To make full use of the information in the\nevent stream, we propose a Cross-Modality and Temporal Collaboration (CMTC)\nnetwork for event-based video person ReID. First, we design an event transform\nnetwork to obtain corresponding auxiliary information from the input of raw\nevents. Additionally, we propose a differential modality collaboration module\nto balance the roles of events and auxiliaries to achieve complementary\neffects. Furthermore, we introduce a temporal collaboration module to exploit\nmotion information and appearance cues. Experimental results demonstrate that\nour method outperforms others in the task of event-based video person ReID.\n","authors":["Renkai Li","Xin Yuan","Wei Liu","Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2501.07296v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.01311v2","updated":"2025-01-13T12:42:14Z","published":"2025-01-02T15:47:56Z","title":"Multi-Head Explainer: A General Framework to Improve Explainability in\n CNNs and Transformers","summary":" In this study, we introduce the Multi-Head Explainer (MHEX), a versatile and\nmodular framework that enhances both the explainability and accuracy of\nConvolutional Neural Networks (CNNs) and Transformer-based models. MHEX\nconsists of three core components: an Attention Gate that dynamically\nhighlights task-relevant features, Deep Supervision that guides early layers to\ncapture fine-grained details pertinent to the target class, and an Equivalent\nMatrix that unifies refined local and global representations to generate\ncomprehensive saliency maps. Our approach demonstrates superior compatibility,\nenabling effortless integration into existing residual networks like ResNet and\nTransformer architectures such as BERT with minimal modifications. 
Extensive\nexperiments on benchmark datasets in medical imaging and text classification\nshow that MHEX not only improves classification accuracy but also produces\nhighly interpretable and detailed saliency scores.\n","authors":["Bohang Sun","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2501.01311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14128v2","updated":"2025-01-13T12:23:55Z","published":"2024-07-19T08:56:12Z","title":"OCTolyzer: Fully automatic toolkit for segmentation and feature\n extracting in optical coherence tomography and scanning laser ophthalmoscopy\n data","summary":" Optical coherence tomography (OCT) and scanning laser ophthalmoscopy (SLO) of\nthe eye has become essential to ophthalmology and the emerging field of\noculomics, thus requiring a need for transparent, reproducible, and rapid\nanalysis of this data for clinical research and the wider research community.\nHere, we introduce OCTolyzer, the first open-source toolkit for retinochoroidal\nanalysis in OCT/SLO data. It features two analysis suites for OCT and SLO data,\nfacilitating deep learning-based anatomical segmentation and feature extraction\nof the cross-sectional retinal and choroidal layers and en face retinal\nvessels. We describe OCTolyzer and evaluate the reproducibility of its OCT\nchoroid analysis. At the population level, metrics for choroid region thickness\nwere highly reproducible, with a mean absolute error (MAE)/Pearson correlation\nfor macular volume choroid thickness (CT) of 6.7$\\mu$m/0.99, macular B-scan CT\nof 11.6$\\mu$m/0.99, and peripapillary CT of 5.0$\\mu$m/0.99. Macular choroid\nvascular index (CVI) also showed strong reproducibility, with MAE/Pearson for\nvolume CVI yielding 0.0271/0.97 and B-scan CVI 0.0130/0.91. At the eye level,\nmeasurement noise for regional and vessel metrics was below 5% and 20% of the\npopulation's variability, respectively. Outliers were caused by poor-quality\nB-scans with thick choroids and invisible choroid-sclera boundary. Processing\ntimes on a laptop CPU were under three seconds for macular/peripapillary\nB-scans and 85 seconds for volume scans. OCTolyzer can convert OCT/SLO data\ninto reproducible and clinically meaningful retinochoroidal features and will\nimprove the standardisation of ocular measurements in OCT/SLO image analysis,\nrequiring no specialised training or proprietary software to be used. OCTolyzer\nis freely available here: https://github.com/jaburke166/OCTolyzer.\n","authors":["Jamie Burke","Justin Engelmann","Samuel Gibbon","Charlene Hamid","Diana Moukaddem","Dan Pugh","Tariq Farrah","Niall Strang","Neeraj Dhaun","Tom MacGillivray","Stuart King","Ian J. C. MacCormick"],"pdf_url":"https://arxiv.org/pdf/2407.14128v2.pdf","comment":"Main paper: 15 pages, 9 figures, 3 tables. Supplementary material: 9\n pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2412.10351v2","updated":"2025-01-13T12:22:52Z","published":"2024-12-13T18:47:11Z","title":"VibrantVS: A high-resolution multi-task transformer for forest canopy\n height estimation","summary":" This paper explores the application of a novel multi-task vision transformer\n(ViT) model for the estimation of canopy height models (CHMs) using 4-band\nNational Agriculture Imagery Program (NAIP) imagery across the western United\nStates. We compare the effectiveness of this model in terms of accuracy and\nprecision aggregated across ecoregions and class heights versus three other\nbenchmark peer-reviewed models. 
Key findings suggest that, while other\nbenchmark models can provide high precision in localized areas, the VibrantVS\nmodel has substantial advantages across a broad reach of ecoregions in the\nwestern United States with higher accuracy, higher precision, the ability to\ngenerate updated inference at a cadence of three years or less, and high\nspatial resolution. The VibrantVS model provides significant value for\necological monitoring and land management decisions for wildfire mitigation.\n","authors":["Tony Chang","Kiarie Ndegwa","Andreas Gros","Vincent A. Landau","Luke J. Zachmann","Bogdan State","Mitchell A. Gritts","Colton W. Miller","Nathan E. Rutenbeck","Scott Conway","Guy Bayes"],"pdf_url":"https://arxiv.org/pdf/2412.10351v2.pdf","comment":"15 pages, 12 figures"},{"id":"http://arxiv.org/abs/2501.07260v1","updated":"2025-01-13T12:18:58Z","published":"2025-01-13T12:18:58Z","title":"Skip Mamba Diffusion for Monocular 3D Semantic Scene Completion","summary":" 3D semantic scene completion is critical for multiple downstream tasks in\nautonomous systems. It estimates missing geometric and semantic information in\nthe acquired scene data. Due to the challenging real-world conditions, this\ntask usually demands complex models that process multi-modal data to achieve\nacceptable performance. We propose a unique neural model, leveraging advances\nfrom the state space and diffusion generative modeling to achieve remarkable 3D\nsemantic scene completion performance with monocular image input. Our technique\nprocesses the data in the conditioned latent space of a variational autoencoder\nwhere diffusion modeling is carried out with an innovative state space\ntechnique. A key component of our neural network is the proposed Skimba (Skip\nMamba) denoiser, which is adept at efficiently processing long-sequence data.\nThe Skimba diffusion model is integral to our 3D scene completion network,\nincorporating a triple Mamba structure, dimensional decomposition residuals and\nvarying dilations along three directions. We also adopt a variant of this\nnetwork for the subsequent semantic segmentation stage of our method. Extensive\nevaluation on the standard SemanticKITTI and SSCBench-KITTI360 datasets show\nthat our approach not only outperforms other monocular techniques by a large\nmargin, it also achieves competitive performance against stereo methods. The\ncode is available at https://github.com/xrkong/skimba\n","authors":["Li Liang","Naveed Akhtar","Jordan Vice","Xiangrui Kong","Ajmal Saeed Mian"],"pdf_url":"https://arxiv.org/pdf/2501.07260v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.07256v1","updated":"2025-01-13T12:11:07Z","published":"2025-01-13T12:11:07Z","title":"EdgeTAM: On-Device Track Anything Model","summary":" On top of Segment Anything Model (SAM), SAM 2 further extends its capability\nfrom image to video inputs through a memory bank mechanism and obtains a\nremarkable performance compared with previous methods, making it a foundation\nmodel for video segmentation task. In this paper, we aim at making SAM 2 much\nmore efficient so that it even runs on mobile devices while maintaining a\ncomparable performance. Despite several works optimizing SAM for better\nefficiency, we find they are not sufficient for SAM 2 because they all focus on\ncompressing the image encoder, while our benchmark shows that the newly\nintroduced memory attention blocks are also the latency bottleneck. 
Given this\nobservation, we propose EdgeTAM, which leverages a novel 2D Spatial Perceiver\nto reduce the computational cost. In particular, the proposed 2D Spatial\nPerceiver encodes the densely stored frame-level memories with a lightweight\nTransformer that contains a fixed set of learnable queries. Given that video\nsegmentation is a dense prediction task, we find preserving the spatial\nstructure of the memories is essential so that the queries are split into\nglobal-level and patch-level groups. We also propose a distillation pipeline\nthat further improves the performance without inference overhead. As a result,\nEdgeTAM achieves 87.7, 70.0, 72.3, and 71.7 J&F on DAVIS 2017, MOSE, SA-V val,\nand SA-V test, while running at 16 FPS on iPhone 15 Pro Max.\n","authors":["Chong Zhou","Chenchen Zhu","Yunyang Xiong","Saksham Suri","Fanyi Xiao","Lemeng Wu","Raghuraman Krishnamoorthi","Bo Dai","Chen Change Loy","Vikas Chandra","Bilge Soran"],"pdf_url":"https://arxiv.org/pdf/2501.07256v1.pdf","comment":"Code will be released at https://github.com/facebookresearch/EdgeTAM"},{"id":"http://arxiv.org/abs/2501.07251v1","updated":"2025-01-13T12:00:34Z","published":"2025-01-13T12:00:34Z","title":"MOS-Attack: A Scalable Multi-objective Adversarial Attack Framework","summary":" Crafting adversarial examples is crucial for evaluating and enhancing the\nrobustness of Deep Neural Networks (DNNs), presenting a challenge equivalent to\nmaximizing a non-differentiable 0-1 loss function.\n However, existing single-objective methods, namely adversarial attacks that focus\non a surrogate loss function, do not fully harness the benefits of engaging\nmultiple loss functions, as a result of an insufficient understanding of their\nsynergistic and conflicting nature.\n To overcome these limitations, we propose the Multi-Objective Set-based\nAttack (MOS Attack), a novel adversarial attack framework leveraging multiple\nloss functions and automatically uncovering their interrelations.\n The MOS Attack adopts a set-based multi-objective optimization strategy,\nenabling the incorporation of numerous loss functions without additional\nparameters.\n It also automatically mines synergistic patterns among various losses,\nfacilitating the generation of potent adversarial attacks with fewer\nobjectives.\n Extensive experiments have shown that our MOS Attack outperforms\nsingle-objective attacks. Furthermore, by harnessing the identified synergistic\npatterns, MOS Attack continues to show superior results with a reduced number\nof loss functions.\n","authors":["Ping Guo","Cheng Gong","Xi Lin","Fei Liu","Zhichao Lu","Qingfu Zhang","Zhenkun Wang"],"pdf_url":"https://arxiv.org/pdf/2501.07251v1.pdf","comment":"Under Review of CVPR 2025"},{"id":"http://arxiv.org/abs/2501.07248v1","updated":"2025-01-13T11:58:02Z","published":"2025-01-13T11:58:02Z","title":"Implicit Neural Representations for Registration of Left Ventricle\n Myocardium During a Cardiac Cycle","summary":" Understanding the movement of the left ventricle myocardium (LVmyo) during\nthe cardiac cycle is essential for assessing cardiac function. One way to model\nthis movement is through a series of deformable image registrations (DIRs) of\nthe LVmyo. Traditional deep learning methods for DIRs, such as those based on\nconvolutional neural networks, often require substantial memory and\ncomputational resources. 
In contrast, implicit neural representations (INRs)\noffer an efficient approach by operating on any number of continuous points.\nThis study extends the use of INRs for DIR to cardiac computed tomography (CT),\nfocusing on LVmyo registration. To enhance the precision of the registration\naround the LVmyo, we incorporate the signed distance field of the LVmyo with\nthe Hounsfield Unit values from the CT frames. This guides the registration of\nthe LVmyo, while keeping the tissue information from the CT frames. Our\nframework demonstrates high registration accuracy and provides a robust method\nfor temporal registration that facilitates further analysis of LVmyo motion.\n","authors":["Mathias Micheelsen Lowes","Jonas Jalili Pedersen","Bjørn S. Hansen","Klaus Fuglsang Kofoed","Maxime Sermesant","Rasmus R. Paulsen"],"pdf_url":"https://arxiv.org/pdf/2501.07248v1.pdf","comment":"9 pages, 5 figures, STACOM 2024"},{"id":"http://arxiv.org/abs/2501.07245v1","updated":"2025-01-13T11:54:26Z","published":"2025-01-13T11:54:26Z","title":"Depth and Image Fusion for Road Obstacle Detection Using Stereo Camera","summary":" This paper is devoted to the detection of objects on a road, performed with a\ncombination of two methods based on both the use of depth information and video\nanalysis of data from a stereo camera. Since neither the time of the appearance\nof an object on the road, nor its size and shape is known in advance,\nML/DL-based approaches are not applicable. The task becomes more complicated\ndue to variations in artificial illumination, inhomogeneous road surface\ntexture, and unknown character and features of the object. To solve this\nproblem we developed the depth and image fusion method that complements a\nsearch of small contrast objects by RGB-based method, and obstacle detection by\nstereo image-based approach with SLIC superpixel segmentation. We conducted\nexperiments with static and low speed obstacles in an underground parking lot\nand demonstrated the successful work of the developed technique for detecting\nand even tracking small objects, which can be parking infrastructure objects,\nthings left on the road, wheels, dropped boxes, etc.\n","authors":["Oleg Perezyabov","Mikhail Gavrilenkov","Ilya Afanasyev"],"pdf_url":"https://arxiv.org/pdf/2501.07245v1.pdf","comment":"8 pages, 15 figures"},{"id":"http://arxiv.org/abs/2501.07244v1","updated":"2025-01-13T11:52:55Z","published":"2025-01-13T11:52:55Z","title":"Can Vision-Language Models Evaluate Handwritten Math?","summary":" Recent advancements in Vision-Language Models (VLMs) have opened new\npossibilities in automatic grading of handwritten student responses,\nparticularly in mathematics. However, a comprehensive study to test the ability\nof VLMs to evaluate and reason over handwritten content remains absent. To\naddress this gap, we introduce FERMAT, a benchmark designed to assess the\nability of VLMs to detect, localize and correct errors in handwritten\nmathematical content. FERMAT spans four key error dimensions - computational,\nconceptual, notational, and presentation - and comprises over 2,200 handwritten\nmath solutions derived from 609 manually curated problems from grades 7-12 with\nintentionally introduced perturbations. Using FERMAT we benchmark nine VLMs\nacross three tasks: error detection, localization, and correction. Our results\nreveal significant shortcomings in current VLMs in reasoning over handwritten\ntext, with Gemini-1.5-Pro achieving the highest error correction rate (77%). 
We\nalso observed that some models struggle with processing handwritten content, as\ntheir accuracy improves when handwritten inputs are replaced with printed text\nor images. These findings highlight the limitations of current VLMs and reveal\nnew avenues for improvement. We release FERMAT and all the associated resources\nin the open-source to drive further research.\n","authors":["Oikantik Nath","Hanani Bathina","Mohammed Safi Ur Rahman Khan","Mitesh M. Khapra"],"pdf_url":"https://arxiv.org/pdf/2501.07244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.20104v2","updated":"2025-01-13T11:46:06Z","published":"2024-12-28T10:12:12Z","title":"SyncDiff: Synchronized Motion Diffusion for Multi-Body Human-Object\n Interaction Synthesis","summary":" Synthesizing realistic human-object interaction motions is a critical problem\nin VR/AR and human animation. Unlike the commonly studied scenarios involving a\nsingle human or hand interacting with one object, we address a more generic\nmulti-body setting with arbitrary numbers of humans, hands, and objects. This\ncomplexity introduces significant challenges in synchronizing motions due to\nthe high correlations and mutual influences among bodies. To address these\nchallenges, we introduce SyncDiff, a novel method for multi-body interaction\nsynthesis using a synchronized motion diffusion strategy. SyncDiff employs a\nsingle diffusion model to capture the joint distribution of multi-body motions.\nTo enhance motion fidelity, we propose a frequency-domain motion decomposition\nscheme. Additionally, we introduce a new set of alignment scores to emphasize\nthe synchronization of different body motions. SyncDiff jointly optimizes both\ndata sample likelihood and alignment likelihood through an explicit\nsynchronization strategy. Extensive experiments across four datasets with\nvarious multi-body configurations demonstrate the superiority of SyncDiff over\nexisting state-of-the-art motion synthesis methods.\n","authors":["Wenkun He","Yun Liu","Ruitao Liu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2412.20104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07236v1","updated":"2025-01-13T11:34:55Z","published":"2025-01-13T11:34:55Z","title":"CSTA: Spatial-Temporal Causal Adaptive Learning for Exemplar-Free Video\n Class-Incremental Learning","summary":" Continual learning aims to acquire new knowledge while retaining past\ninformation. Class-incremental learning (CIL) presents a challenging scenario\nwhere classes are introduced sequentially. For video data, the task becomes\nmore complex than image data because it requires learning and preserving both\nspatial appearance and temporal action involvement. To address this challenge,\nwe propose a novel exemplar-free framework that equips separate spatiotemporal\nadapters to learn new class patterns, accommodating the incremental information\nrepresentation requirements unique to each class. While separate adapters are\nproven to mitigate forgetting and fit unique requirements, naively applying\nthem hinders the intrinsic connection between spatial and temporal information\nincrements, affecting the efficiency of representing newly learned class\ninformation. Motivated by this, we introduce two key innovations from a causal\nperspective. First, a causal distillation module is devised to maintain the\nrelation between spatial-temporal knowledge for a more efficient\nrepresentation. 
Second, a causal compensation mechanism is proposed to reduce\nthe conflicts during increment and memorization between different types of\ninformation. Extensive experiments conducted on benchmark datasets demonstrate\nthat our framework can achieve new state-of-the-art results, surpassing current\nexample-based methods by 4.2% in accuracy on average.\n","authors":["Tieyuan Chen","Huabin Liu","Chern Hong Lim","John See","Xing Gao","Junhui Hou","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2501.07236v1.pdf","comment":"IEEE TCSVT Submission"},{"id":"http://arxiv.org/abs/2501.07227v1","updated":"2025-01-13T11:28:49Z","published":"2025-01-13T11:28:49Z","title":"MECD+: Unlocking Event-Level Causal Graph Discovery for Video Reasoning","summary":" Video causal reasoning aims to achieve a high-level understanding of videos\nfrom a causal perspective. However, it exhibits limitations in its scope,\nprimarily executed in a question-answering paradigm and focusing on brief video\nsegments containing isolated events and basic causal relations, lacking\ncomprehensive and structured causality analysis for videos with multiple\ninterconnected events. To fill this gap, we introduce a new task and dataset,\nMulti-Event Causal Discovery (MECD). It aims to uncover the causal relations\nbetween events distributed chronologically across long videos. Given visual\nsegments and textual descriptions of events, MECD identifies the causal\nassociations between these events to derive a comprehensive and structured\nevent-level video causal graph explaining why and how the result event\noccurred. To address the challenges of MECD, we devise a novel framework\ninspired by the Granger Causality method, incorporating an efficient mask-based\nevent prediction model to perform an Event Granger Test. It estimates causality\nby comparing the predicted result event when premise events are masked versus\nunmasked. Furthermore, we integrate causal inference techniques such as\nfront-door adjustment and counterfactual inference to mitigate challenges in\nMECD like causality confounding and illusory causality. Additionally, context\nchain reasoning is introduced to conduct more robust and generalized reasoning.\nExperiments validate the effectiveness of our framework in reasoning complete\ncausal relations, outperforming GPT-4o and VideoChat2 by 5.77% and 2.70%,\nrespectively. Further experiments demonstrate that causal relation graphs can\nalso contribute to downstream video understanding tasks such as video question\nanswering and video event prediction.\n","authors":["Tieyuan Chen","Huabin Liu","Yi Wang","Yihang Chen","Tianyao He","Chaofan Gan","Huanyu He","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2501.07227v1.pdf","comment":"IEEE TPAMI Submission. arXiv admin note: substantial text overlap\n with arXiv:2409.17647"},{"id":"http://arxiv.org/abs/2501.07221v1","updated":"2025-01-13T11:20:44Z","published":"2025-01-13T11:20:44Z","title":"Exploring the Use of Contrastive Language-Image Pre-Training for Human\n Posture Classification: Insights from Yoga Pose Analysis","summary":" Accurate human posture classification in images and videos is crucial for\nautomated applications across various fields, including work safety, physical\nrehabilitation, sports training, or daily assisted living. Recently, multimodal\nlearning methods, such as Contrastive Language-Image Pretraining (CLIP), have\nadvanced significantly in jointly understanding images and text. 
This study\naims to assess the effectiveness of CLIP in classifying human postures,\nfocusing on its application in yoga. Despite the initial limitations of the\nzero-shot approach, applying transfer learning on 15,301 images (real and\nsynthetic) with 82 classes has shown promising results. The article describes\nthe full procedure for fine-tuning, including the choice for image description\nsyntax, models and hyperparameters adjustment. The fine-tuned CLIP model,\ntested on 3826 images, achieves an accuracy of over 85%, surpassing the current\nstate-of-the-art of previous works on the same dataset by approximately 6%, its\ntraining time being 3.5 times lower than what is needed to fine-tune a\nYOLOv8-based model. For more application-oriented scenarios, with smaller\ndatasets of six postures each, containing 1301 and 401 training images, the\nfine-tuned models attain an accuracy of 98.8% and 99.1%, respectively.\nFurthermore, our experiments indicate that training with as few as 20 images\nper pose can yield around 90% accuracy in a six-class dataset. This study\ndemonstrates that this multimodal technique can be effectively used for yoga\npose classification, and possibly for human posture classification, in general.\nAdditionally, CLIP inference time (around 7 ms) supports that the model can be\nintegrated into automated systems for posture evaluation, e.g., for developing\na real-time personal yoga assistant for performance assessment.\n","authors":["Andrzej D. Dobrzycki","Ana M. Bernardos","Luca Bergesio","Andrzej Pomirski","Daniel Sáez-Trigueros"],"pdf_url":"https://arxiv.org/pdf/2501.07221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07214v1","updated":"2025-01-13T11:12:59Z","published":"2025-01-13T11:12:59Z","title":"TimeLogic: A Temporal Logic Benchmark for Video QA","summary":" Temporal logical understanding, a core facet of human cognition, plays a\npivotal role in capturing complex sequential events and their temporal\nrelationships within videos. This capability is particularly crucial in tasks\nlike Video Question Answering (VideoQA), where the goal is to process visual\ndata over time together with textual data to provide coherent answers. However,\ncurrent VideoQA benchmarks devote little focus to evaluating this critical\nskill due to the challenge of annotating temporal logic. Despite the\nadvancement of vision-language models, assessing their temporal logical\nreasoning powers remains a challenge, primarily due to the lack QA pairs that\ndemand formal, complex temporal reasoning. To bridge this gap, we introduce the\nTimeLogic QA (TLQA) framework to automatically generate the QA pairs,\nspecifically designed to evaluate the temporal logical understanding. To this\nend, TLQA leverages temporal annotations from existing video datasets together\nwith temporal operators derived from logic theory to construct questions that\ntest understanding of event sequences and their temporal relationships. TLQA\nframework is generic and scalable, capable of leveraging both, existing video\naction datasets with temporal action segmentation annotations, or video\ndatasets with temporal scene graph annotations, to automatically generate\ntemporal logical questions. We leverage 4 datasets, STAR, Breakfast, AGQA, and\nCrossTask, and generate two VideoQA dataset variants - small (TLQA-S) and large\n(TLQA-L) - containing 2k and 10k QA pairs for each category, resulting in 32k\nand 160k total pairs per dataset. 
We undertake a comprehensive evaluation of\nleading-edge VideoQA models, employing the TLQA to benchmark their temporal\nlogical understanding capabilities. We assess the VideoQA model's temporal\nreasoning performance on 16 categories of temporal logic with varying temporal\ncomplexity.\n","authors":["Sirnam Swetha","Hilde Kuehne","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2501.07214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07213v1","updated":"2025-01-13T11:12:47Z","published":"2025-01-13T11:12:47Z","title":"Multi-face emotion detection for effective Human-Robot Interaction","summary":" The integration of dialogue interfaces in mobile devices has become\nubiquitous, providing a wide array of services. As technology progresses,\nhumanoid robots designed with human-like features to interact effectively with\npeople are gaining prominence, and the use of advanced human-robot dialogue\ninterfaces is continually expanding. In this context, emotion recognition plays\na crucial role in enhancing human-robot interaction by enabling robots to\nunderstand human intentions. This research proposes a facial emotion detection\ninterface integrated into a mobile humanoid robot, capable of displaying\nreal-time emotions from multiple individuals on a user interface. To this end,\nvarious deep neural network models for facial expression recognition were\ndeveloped and evaluated under consistent computer-based conditions, yielding\npromising results. Afterwards, a trade-off between accuracy and memory\nfootprint was carefully considered to effectively implement this application on\na mobile humanoid robot.\n","authors":["Mohamed Ala Yahyaoui","Mouaad Oujabour","Leila Ben Letaifa","Amine Bohi"],"pdf_url":"https://arxiv.org/pdf/2501.07213v1.pdf","comment":"9 pages, 8 figures and 1 table. Accepted at the 17th International\n Conference on Agents and Artificial Intelligence (ICAART 2025), Porto,\n Portugal"},{"id":"http://arxiv.org/abs/2501.07202v1","updated":"2025-01-13T10:53:48Z","published":"2025-01-13T10:53:48Z","title":"FaceOracle: Chat with a Face Image Oracle","summary":" A face image is a mandatory part of ID and travel documents. Obtaining\nhigh-quality face images when issuing such documents is crucial for both human\nexaminers and automated face recognition systems. In several international\nstandards, face image quality requirements are intricate and defined in detail.\nIdentifying and understanding non-compliance or defects in the submitted face\nimages is crucial for both issuing authorities and applicants. In this work, we\nintroduce FaceOracle, an LLM-powered AI assistant that helps its users analyze\na face image in a natural conversational manner using standard compliant\nalgorithms. Leveraging the power of LLMs, users can get explanations of various\nface image quality concepts as well as interpret the outcome of face image\nquality assessment (FIQA) algorithms. 
We implement a proof-of-concept that\ndemonstrates how experts at an issuing authority could integrate FaceOracle\ninto their workflow to analyze, understand, and communicate their decisions\nmore efficiently, resulting in enhanced productivity.\n","authors":["Wassim Kabbani","Kiran Raja","Raghavendra Ramachandra","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2501.07202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07197v1","updated":"2025-01-13T10:44:08Z","published":"2025-01-13T10:44:08Z","title":"Lung Cancer detection using Deep Learning","summary":" In this paper we discuss lung cancer detection using hybrid model of\nConvolutional-Neural-Networks (CNNs) and Support-Vector-Machines-(SVMs) in\norder to gain early detection of tumors, benign or malignant. The work uses\nthis hybrid model by training upon the Computed Tomography scans (CT scans) as\ndataset. Using deep learning for detecting lung cancer early is a cutting-edge\nmethod.\n","authors":["Aryan Chaudhari","Ankush Singh","Sanchi Gajbhiye","Pratham Agrawal"],"pdf_url":"https://arxiv.org/pdf/2501.07197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07194v1","updated":"2025-01-13T10:42:18Z","published":"2025-01-13T10:42:18Z","title":"VAGeo: View-specific Attention for Cross-View Object Geo-Localization","summary":" Cross-view object geo-localization (CVOGL) aims to locate an object of\ninterest in a captured ground- or drone-view image within the satellite image.\nHowever, existing works treat ground-view and drone-view query images\nequivalently, overlooking their inherent viewpoint discrepancies and the\nspatial correlation between the query image and the satellite-view reference\nimage. To this end, this paper proposes a novel View-specific Attention\nGeo-localization method (VAGeo) for accurate CVOGL. Specifically, VAGeo\ncontains two key modules: view-specific positional encoding (VSPE) module and\nchannel-spatial hybrid attention (CSHA) module. In object-level, according to\nthe characteristics of different viewpoints of ground and drone query images,\nviewpoint-specific positional codings are designed to more accurately identify\nthe click-point object of the query image in the VSPE module. In feature-level,\na hybrid attention in the CSHA module is introduced by combining channel\nattention and spatial attention mechanisms simultaneously for learning\ndiscriminative features. Extensive experimental results demonstrate that the\nproposed VAGeo gains a significant performance improvement, i.e., improving\nacc@0.25/acc@0.5 on the CVOGL dataset from 45.43%/42.24% to 48.21%/45.22% for\nground-view, and from 61.97%/57.66% to 66.19%/61.87% for drone-view.\n","authors":["Zhongyang Li","Xin Yuan","Wei Liu","Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2501.07194v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2411.11543v4","updated":"2025-01-13T10:39:04Z","published":"2024-11-18T13:01:57Z","title":"PSA-VLM: Enhancing Vision-Language Model Safety through Progressive\n Concept-Bottleneck-Driven Alignment","summary":" Benefiting from the powerful capabilities of Large Language Models (LLMs),\npre-trained visual encoder models connected to LLMs form Vision Language Models\n(VLMs). However, recent research shows that the visual modality in VLMs is\nhighly vulnerable, allowing attackers to bypass safety alignment in LLMs\nthrough visually transmitted content, launching harmful attacks. 
To address\nthis challenge, we propose a progressive concept-based alignment strategy,\nPSA-VLM, which incorporates safety modules as concept bottlenecks to enhance\nvisual modality safety alignment. By aligning model predictions with specific\nsafety concepts, we improve defenses against risky images, enhancing\nexplainability and controllability while minimally impacting general\nperformance. Our method is obtained through two-stage training. The low\ncomputational cost of the first stage brings very effective performance\nimprovement, and the fine-tuning of the language model in the second stage\nfurther improves the safety performance. Our method achieves state-of-the-art\nresults on popular VLM safety benchmark.\n","authors":["Zhendong Liu","Yuanbi Nie","Yingshui Tan","Jiaheng Liu","Xiangyu Yue","Qiushi Cui","Chongjun Wang","Xiaoyong Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.11543v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.13581"},{"id":"http://arxiv.org/abs/2501.07192v1","updated":"2025-01-13T10:38:58Z","published":"2025-01-13T10:38:58Z","title":"A4O: All Trigger for One sample","summary":" Backdoor attacks have become a critical threat to deep neural networks\n(DNNs), drawing many research interests. However, most of the studied attacks\nemploy a single type of trigger. Consequently, proposed backdoor defenders\noften rely on the assumption that triggers would appear in a unified way. In\nthis paper, we show that this naive assumption can create a loophole, allowing\nmore sophisticated backdoor attacks to bypass. We design a novel backdoor\nattack mechanism that incorporates multiple types of backdoor triggers,\nfocusing on stealthiness and effectiveness. Our journey begins with the\nintriguing observation that the performance of a backdoor attack in deep\nlearning models, as well as its detectability and removability, are all\nproportional to the magnitude of the trigger. Based on this correlation, we\npropose reducing the magnitude of each trigger type and combining them to\nachieve a strong backdoor relying on the combined trigger while still staying\nsafely under the radar of defenders. Extensive experiments on three standard\ndatasets demonstrate that our method can achieve high attack success rates\n(ASRs) while consistently bypassing state-of-the-art defenses.\n","authors":["Duc Anh Vu","Anh Tuan Tran","Cong Tran","Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2501.07192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05767v2","updated":"2025-01-13T10:38:32Z","published":"2025-01-10T07:56:23Z","title":"Migician: Revealing the Magic of Free-Form Multi-Image Grounding in\n Multimodal Large Language Models","summary":" The recent advancement of Multimodal Large Language Models (MLLMs) has\nsignificantly improved their fine-grained perception of single images and\ngeneral comprehension across multiple images. However, existing MLLMs still\nface challenges in achieving precise grounding in complex multi-image\nscenarios. To address this, we first explore a Chain-of-Thought (CoT) framework\nthat integrates single-image grounding with multi-image comprehension. While\npartially effective, it remains unstable and struggles to capture abstract\nvisual information due to its non-end-to-end nature. Therefore, we introduce\nMigician, the first multi-image grounding model capable of performing free-form\nand accurate grounding across multiple images. 
To support this, we present the\nMGrounding-630k dataset, which comprises data for several multi-image grounding\ntasks derived from existing datasets, along with newly generated free-form\ngrounding instruction-following data. Furthermore, we propose MIG-Bench, a\ncomprehensive benchmark specifically designed for evaluating multi-image\ngrounding capabilities. Experimental results demonstrate that our model\nachieves significantly superior multi-image grounding capabilities,\noutperforming the best existing MLLMs by 21.61% and even surpassing much larger\n70B models. Our code, model, dataset, and benchmark are fully open-sourced at\nhttps://migician-vg.github.io/.\n","authors":["You Li","Heyu Huang","Chi Chen","Kaiyu Huang","Chao Huang","Zonghao Guo","Zhiyuan Liu","Jinan Xu","Yuhua Li","Ruixuan Li","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2501.05767v2.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.07185v1","updated":"2025-01-13T10:30:10Z","published":"2025-01-13T10:30:10Z","title":"Uncertainty Guarantees on Automated Precision Weeding using Conformal\n Prediction","summary":" Precision agriculture in general, and precision weeding in particular, have\ngreatly benefited from the major advancements in deep learning and computer\nvision. A large variety of commercial robotic solutions are already available\nand deployed. However, the adoption by farmers of such solutions is still low\nfor many reasons, an important one being the lack of trust in these systems.\nThis is in great part due to the opaqueness and complexity of deep neural\nnetworks and the manufacturers' inability to provide valid guarantees on their\nperformance. Conformal prediction, a well-established methodology in the\nmachine learning community, is an efficient and reliable strategy for providing\ntrustworthy guarantees on the predictions of any black-box model under very\nminimal constraints. Bridging the gap between the safe machine learning and\nprecision agriculture communities, this article showcases conformal prediction\nin action on the task of precision weeding through deep learning-based image\nclassification. After a detailed presentation of the conformal prediction\nmethodology and the development of a precision spraying pipeline based on a\n''conformalized'' neural network and well-defined spraying decision rules, the\narticle evaluates this pipeline on two real-world scenarios: one under\nin-distribution conditions, the other reflecting a near out-of-distribution\nsetting. The results show that we are able to provide formal, i.e. certifiable,\nguarantees on spraying at least 90% of the weeds.\n","authors":["Paul Melki","Lionel Bombrun","Boubacar Diallo","Jérôme Dias","Jean-Pierre da Costa"],"pdf_url":"https://arxiv.org/pdf/2501.07185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07179v1","updated":"2025-01-13T10:19:16Z","published":"2025-01-13T10:19:16Z","title":"Radial Distortion in Face Images: Detection and Impact","summary":" Acquiring face images of sufficiently high quality is important for online ID\nand travel document issuance applications using face recognition systems (FRS).\nLow-quality, manipulated (intentionally or unintentionally), or distorted\nimages degrade the FRS performance and facilitate documents' misuse. Securing\nquality for enrolment images, especially in the unsupervised self-enrolment\nscenario via a smartphone, becomes important to assure FRS performance. 
In this\nwork, we focus on the less studied area of radial distortion (a.k.a., the\nfish-eye effect) in face images and its impact on FRS performance. We introduce\nan effective radial distortion detection model that can detect and flag radial\ndistortion in the enrolment scenario. We formalize the detection model as a\nface image quality assessment (FIQA) algorithm and provide a careful inspection\nof the effect of radial distortion on FRS performance. Evaluation results show\nexcellent detection results for the proposed models, and the study on the\nimpact on FRS uncovers valuable insights into how to best use these models in\noperational systems.\n","authors":["Wassim Kabbani","Tristan Le Pessot","Kiran Raja","Raghavendra Ramachandra","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2501.07179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20971v2","updated":"2025-01-13T10:14:27Z","published":"2024-05-31T16:18:46Z","title":"Amortizing intractable inference in diffusion models for vision,\n language, and control","summary":" Diffusion models have emerged as effective distribution estimators in vision,\nlanguage, and reinforcement learning, but their use as priors in downstream\ntasks poses an intractable posterior inference problem. This paper studies\namortized sampling of the posterior over data, $\\mathbf{x}\\sim p^{\\rm\npost}(\\mathbf{x})\\propto p(\\mathbf{x})r(\\mathbf{x})$, in a model that consists\nof a diffusion generative model prior $p(\\mathbf{x})$ and a black-box\nconstraint or likelihood function $r(\\mathbf{x})$. We state and prove the\nasymptotic correctness of a data-free learning objective, relative trajectory\nbalance, for training a diffusion model that samples from this posterior, a\nproblem that existing methods solve only approximately or in restricted cases.\nRelative trajectory balance arises from the generative flow network perspective\non diffusion models, which allows the use of deep reinforcement learning\ntechniques to improve mode coverage. Experiments illustrate the broad potential\nof unbiased inference of arbitrary posteriors under diffusion priors: in vision\n(classifier guidance), language (infilling under a discrete diffusion LLM), and\nmultimodal data (text-to-image generation). Beyond generative modeling, we\napply relative trajectory balance to the problem of continuous control with a\nscore-based behavior prior, achieving state-of-the-art results on benchmarks in\noffline reinforcement learning.\n","authors":["Siddarth Venkatraman","Moksh Jain","Luca Scimeca","Minsu Kim","Marcin Sendera","Mohsin Hasan","Luke Rowe","Sarthak Mittal","Pablo Lemos","Emmanuel Bengio","Alexandre Adam","Jarrid Rector-Brooks","Yoshua Bengio","Glen Berseth","Nikolay Malkin"],"pdf_url":"https://arxiv.org/pdf/2405.20971v2.pdf","comment":"NeurIPS 2024; code: https://github.com/GFNOrg/diffusion-finetuning"},{"id":"http://arxiv.org/abs/2412.15523v2","updated":"2025-01-13T10:01:56Z","published":"2024-12-20T03:23:26Z","title":"InstructOCR: Instruction Boosting Scene Text Spotting","summary":" In the field of scene text spotting, previous OCR methods primarily relied on\nimage encoders and pre-trained text information, but they often overlooked the\nadvantages of incorporating human language instructions. To address this gap,\nwe propose InstructOCR, an innovative instruction-based scene text spotting\nmodel that leverages human language instructions to enhance the understanding\nof text within images. 
Our framework employs both text and image encoders\nduring training and inference, along with instructions meticulously designed\nbased on text attributes. This approach enables the model to interpret text\nmore accurately and flexibly. Extensive experiments demonstrate the\neffectiveness of our model and we achieve state-of-the-art results on widely\nused benchmarks. Furthermore, the proposed framework can be seamlessly applied\nto scene text VQA tasks. By leveraging instruction strategies during\npre-training, the performance on downstream VQA tasks can be significantly\nimproved, with a 2.6% increase on the TextVQA dataset and a 2.1% increase on\nthe ST-VQA dataset. These experimental results provide insights into the\nbenefits of incorporating human language instructions for OCR-related tasks.\n","authors":["Chen Duan","Qianyi Jiang","Pei Fu","Jiamin Chen","Shengxi Li","Zining Wang","Shan Guo","Junfeng Luo"],"pdf_url":"https://arxiv.org/pdf/2412.15523v2.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.07171v1","updated":"2025-01-13T09:58:03Z","published":"2025-01-13T09:58:03Z","title":"BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and\n Vision-Language Models Derived from Scientific Literature","summary":" The development of vision-language models (VLMs) is driven by large-scale and\ndiverse multimodal datasets. However, progress toward generalist biomedical\nVLMs is limited by the lack of annotated, publicly accessible datasets across\nbiology and medicine. Existing efforts are restricted to narrow domains,\nmissing the full diversity of biomedical knowledge encoded in scientific\nliterature. To address this gap, we introduce BIOMEDICA, a scalable,\nopen-source framework to extract, annotate, and serialize the entirety of the\nPubMed Central Open Access subset into an easy-to-use, publicly accessible\ndataset.Our framework produces a comprehensive archive with over 24 million\nunique image-text pairs from over 6 million articles. Metadata and\nexpert-guided annotations are also provided. We demonstrate the utility and\naccessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style\nmodels continuously pre-trained on the BIOMEDICA dataset via streaming,\neliminating the need to download 27 TB of data locally.On average, our models\nachieve state-of-the-art performance across 40 tasks - spanning pathology,\nradiology, ophthalmology, dermatology, surgery, molecular biology,\nparasitology, and cell biology - excelling in zero-shot classification with a\n6.56% average improvement (as high as 29.8% and 17.5% in dermatology and\nophthalmology, respectively), and stronger image-text retrieval, all while\nusing 10x less compute. 
To foster reproducibility and collaboration, we release\nour codebase and dataset for the broader research community.\n","authors":["Alejandro Lozano","Min Woo Sun","James Burgess","Liangyu Chen","Jeffrey J Nirschl","Jeffrey Gu","Ivan Lopez","Josiah Aklilu","Austin Wolfgang Katzer","Collin Chiu","Anita Rau","Xiaohan Wang","Yuhui Zhang","Alfred Seunghoon Song","Robert Tibshirani","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2501.07171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07163v1","updated":"2025-01-13T09:49:34Z","published":"2025-01-13T09:49:34Z","title":"Adaptive Noise-Tolerant Network for Image Segmentation","summary":" Unlike image classification and annotation, for which deep network models\nhave achieved dominating superior performances compared to traditional computer\nvision algorithms, deep learning for automatic image segmentation still faces\ncritical challenges. One of such hurdles is to obtain ground-truth\nsegmentations as the training labels for deep network training. Especially when\nwe study biomedical images, such as histopathological images (histo-images), it\nis unrealistic to ask for manual segmentation labels as the ground truth for\ntraining due to the fine image resolution as well as the large image size and\ncomplexity. In this paper, instead of relying on clean segmentation labels, we\nstudy whether and how integrating imperfect or noisy segmentation results from\noff-the-shelf segmentation algorithms may help achieve better segmentation\nresults through a new Adaptive Noise-Tolerant Network (ANTN) model. We extend\nthe noisy label deep learning to image segmentation with two novel aspects: (1)\nmultiple noisy labels can be integrated into one deep learning model; (2) noisy\nsegmentation modeling, including probabilistic parameters, is adaptive,\ndepending on the given testing image appearance. Implementation of the new ANTN\nmodel on both the synthetic data and real-world histo-images demonstrates its\neffectiveness and superiority over off-the-shelf and other existing\ndeep-learning-based image segmentation algorithms.\n","authors":["Weizhi Li"],"pdf_url":"https://arxiv.org/pdf/2501.07163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05862v3","updated":"2025-01-13T09:33:47Z","published":"2024-06-09T17:25:47Z","title":"II-Bench: An Image Implication Understanding Benchmark for Multimodal\n Large Language Models","summary":" The rapid advancements in the development of multimodal large language models\n(MLLMs) have consistently led to new breakthroughs on various benchmarks. In\nresponse, numerous challenging and comprehensive benchmarks have been proposed\nto more accurately assess the capabilities of MLLMs. However, there is a dearth\nof exploration of the higher-order perceptual capabilities of MLLMs. To fill\nthis gap, we propose the Image Implication understanding Benchmark, II-Bench,\nwhich aims to evaluate the model's higher-order perception of images. Through\nextensive experiments on II-Bench across multiple MLLMs, we have made\nsignificant findings. Initially, a substantial gap is observed between the\nperformance of MLLMs and humans on II-Bench. The pinnacle accuracy of MLLMs\nattains 74.8%, whereas human accuracy averages 90%, peaking at an impressive\n98%. Subsequently, MLLMs perform worse on abstract and complex images,\nsuggesting limitations in their ability to understand high-level semantics and\ncapture image details. 
Finally, it is observed that most models exhibit\nenhanced accuracy when image sentiment polarity hints are incorporated into the\nprompts. This observation underscores a notable deficiency in their inherent\nunderstanding of image sentiment. We believe that II-Bench will inspire the\ncommunity to develop the next generation of MLLMs, advancing the journey\ntowards expert artificial general intelligence (AGI). II-Bench is publicly\navailable at https://huggingface.co/datasets/m-a-p/II-Bench.\n","authors":["Ziqiang Liu","Feiteng Fang","Xi Feng","Xinrun Du","Chenhao Zhang","Zekun Wang","Yuelin Bai","Qixuan Zhao","Liyang Fan","Chengguang Gan","Hongquan Lin","Jiaming Li","Yuansheng Ni","Haihong Wu","Yaswanth Narsupalli","Zhigang Zheng","Chengming Li","Xiping Hu","Ruifeng Xu","Xiaojun Chen","Min Yang","Jiaheng Liu","Ruibo Liu","Wenhao Huang","Ge Zhang","Shiwen Ni"],"pdf_url":"https://arxiv.org/pdf/2406.05862v3.pdf","comment":"100 pages, 82 figures, add citations"},{"id":"http://arxiv.org/abs/2501.07158v1","updated":"2025-01-13T09:33:03Z","published":"2025-01-13T09:33:03Z","title":"Eye Sclera for Fair Face Image Quality Assessment","summary":" Fair operational systems are crucial in gaining and maintaining society's\ntrust in face recognition systems (FRS). FRS start with capturing an image and\nassessing its quality before using it further for enrollment or verification.\nFair Face Image Quality Assessment (FIQA) schemes therefore become equally\nimportant in the context of fair FRS. This work examines the sclera as a\nquality assessment region for obtaining a fair FIQA. The sclera region is\nagnostic to demographic variations and skin colour for assessing the quality of\na face image. We analyze three skin tone related ISO/IEC face image quality\nassessment measures and assess the sclera region as an alternative area for\nassessing FIQ. Our analysis of the face dataset of individuals from different\ndemographic groups representing different skin tones indicates sclera as an\nalternative to measure dynamic range, over- and under-exposure of face using\nsclera region alone. The sclera region being agnostic to skin tone, i.e.,\ndemographic factors, provides equal utility as a fair FIQA as shown by our\nError-vs-Discard Characteristic (EDC) curve analysis.\n","authors":["Wassim Kabbani","Kiran Raja","Raghavendra Ramachandra","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2501.07158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14432v2","updated":"2025-01-13T09:26:17Z","published":"2024-09-22T13:11:08Z","title":"EM-DARTS: Hierarchical Differentiable Architecture Search for Eye\n Movement Recognition","summary":" Eye movement biometrics has received increasing attention thanks to its\nhighly secure identification. Although deep learning (DL) models have shown\nsuccess in eye movement recognition, their architectures largely rely on human\nprior knowledge. Differentiable Neural Architecture Search (DARTS) automates\nthe manual process of architecture design with high search efficiency. However,\nDARTS typically stacks multiple cells to form a convolutional network, which\nlimits the diversity of architecture. Furthermore, DARTS generally searches for\narchitectures using shallower networks than those used in the evaluation,\ncreating a significant disparity in architecture depth between the search and\nevaluation phases. 
To address this issue, we propose EM-DARTS, a hierarchical\ndifferentiable architecture search algorithm to automatically design the DL\narchitecture for eye movement recognition. First, we define a supernet and\npropose a global and local alternate Neural Architecture Search method to\nsearch the optimal architecture alternately with a differentiable neural\narchitecture search. The local search strategy aims to find an optimal\narchitecture for different cells while the global search strategy is\nresponsible for optimizing the architecture of the target network. To minimize\nredundancy, transfer entropy is proposed to compute the information amount of\neach layer, thereby further simplifying the network search process.\nExperimental results on three public datasets demonstrate that the proposed\nEM-DARTS is capable of producing an optimal architecture that leads to\nstate-of-the-art recognition performance, {Specifically, the recognition models\ndeveloped using EM-DARTS achieved the lowest EERs of 0.0453 on the GazeBase\ndataset, 0.0377 on the JuDo1000 dataset, and 0.1385 on the EMglasses dataset.\n","authors":["Huafeng Qin","Hongyu Zhu","Xin Jin","Xin Yu","Mounim A. El-Yacoubi","Shuqiang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.14432v2.pdf","comment":"Submited to IEEE Transactions on Instrumentation and Measurement"},{"id":"http://arxiv.org/abs/2407.19507v2","updated":"2025-01-13T08:58:40Z","published":"2024-07-28T14:58:07Z","title":"WeCromCL: Weakly Supervised Cross-Modality Contrastive Learning for\n Transcription-only Supervised Text Spotting","summary":" Transcription-only Supervised Text Spotting aims to learn text spotters\nrelying only on transcriptions but no text boundaries for supervision, thus\neliminating expensive boundary annotation. The crux of this task lies in\nlocating each transcription in scene text images without location annotations.\nIn this work, we formulate this challenging problem as a Weakly Supervised\nCross-modality Contrastive Learning problem, and design a simple yet effective\nmodel dubbed WeCromCL that is able to detect each transcription in a scene\nimage in a weakly supervised manner. Unlike typical methods for cross-modality\ncontrastive learning that focus on modeling the holistic semantic correlation\nbetween an entire image and a text description, our WeCromCL conducts atomistic\ncontrastive learning to model the character-wise appearance consistency between\na text transcription and its correlated region in a scene image to detect an\nanchor point for the transcription in a weakly supervised manner. The detected\nanchor points by WeCromCL are further used as pseudo location labels to guide\nthe learning of text spotting. Extensive experiments on four challenging\nbenchmarks demonstrate the superior performance of our model over other\nmethods. Code will be released.\n","authors":["Jingjing Wu","Zhengyao Fang","Pengyuan Lyu","Chengquan Zhang","Fanglin Chen","Guangming Lu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2407.19507v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2501.05826v2","updated":"2025-01-13T08:56:05Z","published":"2025-01-10T10:03:56Z","title":"AI-Driven Diabetic Retinopathy Screening: Multicentric Validation of\n AIDRSS in India","summary":" Purpose: Diabetic retinopathy (DR) is a major cause of vision loss,\nparticularly in India, where access to retina specialists is limited in rural\nareas. 
This study aims to evaluate the Artificial Intelligence-based Diabetic\nRetinopathy Screening System (AIDRSS) for DR detection and prevalence\nassessment, addressing the growing need for scalable, automated screening\nsolutions in resource-limited settings.\n Approach: A multicentric, cross-sectional study was conducted in Kolkata,\nIndia, involving 5,029 participants and 10,058 macula-centric retinal fundus\nimages. The AIDRSS employed a deep learning algorithm with 50 million trainable\nparameters, integrated with Contrast Limited Adaptive Histogram Equalization\n(CLAHE) preprocessing for enhanced image quality. DR was graded using the\nInternational Clinical Diabetic Retinopathy (ICDR) Scale, categorizing disease\ninto five stages (DR0 to DR4). Statistical metrics including sensitivity,\nspecificity, and prevalence rates were evaluated against expert retina\nspecialist assessments.\n Results: The prevalence of DR in the general population was 13.7%, rising to\n38.2% among individuals with elevated random blood glucose levels. The AIDRSS\nachieved an overall sensitivity of 92%, specificity of 88%, and 100%\nsensitivity for detecting referable DR (DR3 and DR4). These results demonstrate\nthe system's robust performance in accurately identifying and grading DR in a\ndiverse population.\n Conclusions: AIDRSS provides a reliable, scalable solution for early DR\ndetection in resource-constrained environments. Its integration of advanced AI\ntechniques ensures high diagnostic accuracy, with potential to significantly\nreduce the burden of diabetes-related vision loss in underserved regions.\n","authors":["Amit Kr Dey","Pradeep Walia","Girish Somvanshi","Abrar Ali","Sagarnil Das","Pallabi Paul","Minakhi Ghosh"],"pdf_url":"https://arxiv.org/pdf/2501.05826v2.pdf","comment":"22 pages, 5 figures. arXiv admin note: substantial text overlap with\n arXiv:1812.07105 by other authors without attribution"},{"id":"http://arxiv.org/abs/2501.07133v1","updated":"2025-01-13T08:44:35Z","published":"2025-01-13T08:44:35Z","title":"Robust Single Object Tracking in LiDAR Point Clouds under Adverse\n Weather Conditions","summary":" 3D single object tracking (3DSOT) in LiDAR point clouds is a critical task\nfor outdoor perception, enabling real-time perception of object location,\norientation, and motion. Despite the impressive performance of current 3DSOT\nmethods, evaluating them on clean datasets inadequately reflects their\ncomprehensive performance, as the adverse weather conditions in real-world\nsurroundings has not been considered. One of the main obstacles is the lack of\nadverse weather benchmarks for the evaluation of 3DSOT. To this end, this work\nproposes a challenging benchmark for LiDAR-based 3DSOT in adverse weather,\nwhich comprises two synthetic datasets (KITTI-A and nuScenes-A) and one\nreal-world dataset (CADC-SOT) spanning three weather types: rain, fog, and\nsnow. Based on this benchmark, five representative 3D trackers from different\ntracking frameworks conducted robustness evaluation, resulting in significant\nperformance degradations. This prompts the question: What are the factors that\ncause current advanced methods to fail on such adverse weather samples?\nConsequently, we explore the impacts of adverse weather and answer the above\nquestion from three perspectives: 1) target distance; 2) template shape\ncorruption; and 3) target shape corruption. 
Finally, based on domain\nrandomization and contrastive learning, we designed a dual-branch tracking\nframework for adverse weather, named DRCT, achieving excellent performance in\nbenchmarks.\n","authors":["Xiantong Zhao","Xiuping Liu","Shengjing Tian","Yinan Han"],"pdf_url":"https://arxiv.org/pdf/2501.07133v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.06019v2","updated":"2025-01-13T08:42:11Z","published":"2024-08-12T09:19:38Z","title":"HeadGAP: Few-Shot 3D Head Avatar via Generalizable Gaussian Priors","summary":" In this paper, we present a novel 3D head avatar creation approach capable of\ngeneralizing from few-shot in-the-wild data with high-fidelity and animatable\nrobustness. Given the underconstrained nature of this problem, incorporating\nprior knowledge is essential. Therefore, we propose a framework comprising\nprior learning and avatar creation phases. The prior learning phase leverages\n3D head priors derived from a large-scale multi-view dynamic dataset, and the\navatar creation phase applies these priors for few-shot personalization. Our\napproach effectively captures these priors by utilizing a Gaussian\nSplatting-based auto-decoder network with part-based dynamic modeling. Our\nmethod employs identity-shared encoding with personalized latent codes for\nindividual identities to learn the attributes of Gaussian primitives. During\nthe avatar creation phase, we achieve fast head avatar personalization by\nleveraging inversion and fine-tuning strategies. Extensive experiments\ndemonstrate that our model effectively exploits head priors and successfully\ngeneralizes them to few-shot personalization, achieving photo-realistic\nrendering quality, multi-view consistency, and stable animation.\n","authors":["Xiaozheng Zheng","Chao Wen","Zhaohu Li","Weiyi Zhang","Zhuo Su","Xu Chang","Yang Zhao","Zheng Lv","Xiaoyuan Zhang","Yongjie Zhang","Guidong Wang","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2408.06019v2.pdf","comment":"Accepted to 3DV 2025. Project page: https://headgap.github.io/"},{"id":"http://arxiv.org/abs/2501.07120v1","updated":"2025-01-13T08:22:10Z","published":"2025-01-13T08:22:10Z","title":"MSV-Mamba: A Multiscale Vision Mamba Network for Echocardiography\n Segmentation","summary":" Ultrasound imaging frequently encounters challenges, such as those related to\nelevated noise levels, diminished spatiotemporal resolution, and the complexity\nof anatomical structures. These factors significantly hinder the model's\nability to accurately capture and analyze structural relationships and dynamic\npatterns across various regions of the heart. Mamba, an emerging model, is one\nof the most cutting-edge approaches that is widely applied to diverse vision\nand language tasks. To this end, this paper introduces a U-shaped deep learning\nmodel incorporating a large-window Mamba scale (LMS) module and a hierarchical\nfeature fusion approach for echocardiographic segmentation. First, a cascaded\nresidual block serves as an encoder and is employed to incrementally extract\nmultiscale detailed features. Second, a large-window multiscale mamba module is\nintegrated into the decoder to capture global dependencies across regions and\nenhance the segmentation capability for complex anatomical structures.\nFurthermore, our model introduces auxiliary losses at each decoder layer and\nemploys a dual attention mechanism to fuse multilayer features both spatially\nand across channels. 
This approach enhances segmentation performance and\naccuracy in delineating complex anatomical structures. Finally, the\nexperimental results using the EchoNet-Dynamic and CAMUS datasets demonstrate\nthat the model outperforms other methods in terms of both accuracy and\nrobustness. For the segmentation of the left ventricular endocardium\n(${LV}_{endo}$), the model achieved optimal values of 95.01 and 93.36,\nrespectively, while for the left ventricular epicardium (${LV}_{epi}$), values\nof 87.35 and 87.80, respectively, were achieved. This represents an improvement\nranging between 0.54 and 1.11 compared with the best-performing model.\n","authors":["Xiaoxian Yang","Qi Wang","Kaiqi Zhang","Ke Wei","Jun Lyu","Lingchao Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04746v3","updated":"2025-01-13T08:08:28Z","published":"2023-12-07T23:16:37Z","title":"Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized\n Narratives from Open-Source Histopathology Videos","summary":" Diagnosis in histopathology requires a global whole slide images (WSIs)\nanalysis, requiring pathologists to compound evidence from different WSI\npatches. The gigapixel scale of WSIs poses a challenge for histopathology\nmulti-modal models. Training multi-model models for histopathology requires\ninstruction tuning datasets, which currently contain information for individual\nimage patches, without a spatial grounding of the concepts within each patch\nand without a wider view of the WSI. Therefore, they lack sufficient diagnostic\ncapacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a\nlarge-scale dataset of 107,131 histopathology-specific instruction\nquestion/answer pairs, grounded within diagnostically relevant image patches\nthat make up the WSI. Our dataset is collected by leveraging educational\nhistopathology videos from YouTube, which provides spatial localization of\nnarrations by automatically extracting the narrators' cursor positions.\nQuilt-Instruct supports contextual reasoning by extracting diagnosis and\nsupporting facts from the entire WSI. Using Quilt-Instruct, we train\nQuilt-LLaVA, which can reason beyond the given single image patch, enabling\ndiagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a\ncomprehensive evaluation dataset created from 985 images and 1283\nhuman-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using\npublic histopathology datasets, where Quilt-LLaVA significantly outperforms\nSOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set\nVQA. Our code, data, and model are publicly accessible at\nquilt-llava.github.io.\n","authors":["Mehmet Saygin Seyfioglu","Wisdom O. Ikezogwo","Fatemeh Ghezloo","Ranjay Krishna","Linda Shapiro"],"pdf_url":"https://arxiv.org/pdf/2312.04746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07114v1","updated":"2025-01-13T08:04:32Z","published":"2025-01-13T08:04:32Z","title":"Duplex: Dual Prototype Learning for Compositional Zero-Shot Learning","summary":" Compositional Zero-Shot Learning (CZSL) aims to enable models to recognize\nnovel compositions of visual states and objects that were absent during\ntraining. Existing methods predominantly focus on learning semantic\nrepresentations of seen compositions but often fail to disentangle the\nindependent features of states and objects in images, thereby limiting their\nability to generalize to unseen compositions. 
To address this challenge, we\npropose Duplex, a novel dual-prototype learning method that integrates semantic\nand visual prototypes through a carefully designed dual-branch architecture,\nenabling effective representation learning for compositional tasks. Duplex\nutilizes a Graph Neural Network (GNN) to adaptively update visual prototypes,\ncapturing complex interactions between states and objects. Additionally, it\nleverages the strong visual-semantic alignment of pre-trained Vision-Language\nModels (VLMs) and employs a multi-path architecture combined with prompt\nengineering to align image and text representations, ensuring robust\ngeneralization. Extensive experiments on three benchmark datasets demonstrate\nthat Duplex outperforms state-of-the-art methods in both closed-world and\nopen-world settings.\n","authors":["Zhong Peng","Yishi Xu","Gerong Wang","Wenchao Chen","Bo Chen","Jing Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07113v1","updated":"2025-01-13T08:03:49Z","published":"2025-01-13T08:03:49Z","title":"Matching Free Depth Recovery from Structured Light","summary":" We present a novel approach for depth estimation from images captured by\nstructured light systems. Unlike many previous methods that rely on image\nmatching process, our approach uses a density voxel grid to represent scene\ngeometry, which is trained via self-supervised differentiable volume rendering.\nOur method leverages color fields derived from projected patterns in structured\nlight systems during the rendering process, enabling the isolated optimization\nof the geometry field. This contributes to faster convergence and high-quality\noutput. Additionally, we incorporate normalized device coordinates (NDC), a\ndistortion loss, and a novel surface-based color loss to enhance geometric\nfidelity. Experimental results demonstrate that our method outperforms existing\nmatching-based techniques in geometric performance for few-shot scenarios,\nachieving approximately a 60% reduction in average estimated depth errors on\nsynthetic scenes and about 30% on real-world captured scenes. Furthermore, our\napproach delivers fast training, with a speed roughly three times faster than\nprevious matching-free methods that employ implicit representations.\n","authors":["Zhuohang Yu","Kai Wang","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07113v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.07110v1","updated":"2025-01-13T07:51:43Z","published":"2025-01-13T07:51:43Z","title":"Dynamic Multimodal Fusion via Meta-Learning Towards Micro-Video\n Recommendation","summary":" Multimodal information (e.g., visual, acoustic, and textual) has been widely\nused to enhance representation learning for micro-video recommendation. For\nintegrating multimodal information into a joint representation of micro-video,\nmultimodal fusion plays a vital role in the existing micro-video recommendation\napproaches. However, the static multimodal fusion used in previous studies is\ninsufficient to model the various relationships among multimodal information of\ndifferent micro-videos. In this paper, we develop a novel meta-learning-based\nmultimodal fusion framework called Meta Multimodal Fusion (MetaMMF), which\ndynamically assigns parameters to the multimodal fusion function for each\nmicro-video during its representation learning. Specifically, MetaMMF regards\nthe multimodal fusion of each micro-video as an independent task. 
Based on the\nmeta information extracted from the multimodal features of the input task,\nMetaMMF parameterizes a neural network as the item-specific fusion function via\na meta learner. We perform extensive experiments on three benchmark datasets,\ndemonstrating the significant improvements over several state-of-the-art\nmultimodal recommendation models, like MMGCN, LATTICE, and InvRL. Furthermore,\nwe lighten our model by adopting canonical polyadic decomposition to improve\nthe training efficiency, and validate its effectiveness through experimental\nresults. Codes are available at https://github.com/hanliu95/MetaMMF.\n","authors":["Han Liu","Yinwei Wei","Fan Liu","Wenjie Wang","Liqiang Nie","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2501.07110v1.pdf","comment":"This paper has been accepted by ACM Transactions on Information\n Systems"},{"id":"http://arxiv.org/abs/2501.07109v1","updated":"2025-01-13T07:43:33Z","published":"2025-01-13T07:43:33Z","title":"The Quest for Visual Understanding: A Journey Through the Evolution of\n Visual Question Answering","summary":" Visual Question Answering (VQA) is an interdisciplinary field that bridges\nthe gap between computer vision (CV) and natural language processing(NLP),\nenabling Artificial Intelligence(AI) systems to answer questions about images.\nSince its inception in 2015, VQA has rapidly evolved, driven by advances in\ndeep learning, attention mechanisms, and transformer-based models. This survey\ntraces the journey of VQA from its early days, through major breakthroughs,\nsuch as attention mechanisms, compositional reasoning, and the rise of\nvision-language pre-training methods. We highlight key models, datasets, and\ntechniques that shaped the development of VQA systems, emphasizing the pivotal\nrole of transformer architectures and multimodal pre-training in driving recent\nprogress. Additionally, we explore specialized applications of VQA in domains\nlike healthcare and discuss ongoing challenges, such as dataset bias, model\ninterpretability, and the need for common-sense reasoning. Lastly, we discuss\nthe emerging trends in large multimodal language models and the integration of\nexternal knowledge, offering insights into the future directions of VQA. This\npaper aims to provide a comprehensive overview of the evolution of VQA,\nhighlighting both its current state and potential advancements.\n","authors":["Anupam Pandey","Deepjyoti Bodo","Arpan Phukan","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2501.07109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07104v1","updated":"2025-01-13T07:32:44Z","published":"2025-01-13T07:32:44Z","title":"RMAvatar: Photorealistic Human Avatar Reconstruction from Monocular\n Video Based on Rectified Mesh-embedded Gaussians","summary":" We introduce RMAvatar, a novel human avatar representation with Gaussian\nsplatting embedded on mesh to learn clothed avatar from a monocular video. We\nutilize the explicit mesh geometry to represent motion and shape of a virtual\nhuman and implicit appearance rendering with Gaussian Splatting. Our method\nconsists of two main modules: Gaussian initialization module and Gaussian\nrectification module. We embed Gaussians into triangular faces and control\ntheir motion through the mesh, which ensures low-frequency motion and surface\ndeformation of the avatar. Due to the limitations of LBS formula, the human\nskeleton is hard to control complex non-rigid transformations. 
We then design a\npose-related Gaussian rectification module to learn fine-detailed non-rigid\ndeformations, further improving the realism and expressiveness of the avatar.\nWe conduct extensive experiments on public datasets, RMAvatar shows\nstate-of-the-art performance on both rendering quality and quantitative\nevaluations. Please see our project page at https://rm-avatar.github.io.\n","authors":["Sen Peng","Weixing Xie","Zilong Wang","Xiaohu Guo","Zhonggui Chen","Baorong Yang","Xiao Dong"],"pdf_url":"https://arxiv.org/pdf/2501.07104v1.pdf","comment":"CVM2025"},{"id":"http://arxiv.org/abs/2411.14789v2","updated":"2025-01-13T07:29:53Z","published":"2024-11-22T08:17:46Z","title":"Simplifying CLIP: Unleashing the Power of Large-Scale Models on\n Consumer-level Computers","summary":" Contrastive Language-Image Pre-training (CLIP) has attracted a surge of\nattention for its superior zero-shot performance and excellent transferability\nto downstream tasks. However, training such large-scale models usually requires\nsubstantial computation and storage, which poses barriers for general users\nwith consumer-level computers. Motivated by this observation, in this paper we\ninvestigate how to achieve competitive performance on only one Nvidia RTX3090\nGPU and with one terabyte for storing dataset. On one hand, we simplify the\ntransformer block structure and combine Weight Inheritance with multi-stage\nKnowledge Distillation (WIKD), thereby reducing the parameters and improving\nthe inference speed during training along with deployment. On the other hand,\nconfronted with the convergence challenge posed by small dataset, we generate\nsynthetic captions for each sample as data augmentation, and devise a novel\nPair Matching (PM) loss to fully exploit the distinguishment among positive and\nnegative image-text pairs. Extensive experiments demonstrate that our model can\nachieve a new state-of-the-art datascale-parameter-accuracy tradeoff, which\ncould further popularize the CLIP model in the related research community.\n","authors":["Hongbo Liu"],"pdf_url":"https://arxiv.org/pdf/2411.14789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07101v1","updated":"2025-01-13T07:26:37Z","published":"2025-01-13T07:26:37Z","title":"Dual Scale-aware Adaptive Masked Knowledge Distillation for Object\n Detection","summary":" Recent feature masking knowledge distillation methods make use of attention\nmechanisms to identify either important spatial regions or channel clues for\ndiscriminative feature reconstruction. However, most of existing strategies\nperform global attention-guided feature masking distillation without delving\ninto fine-grained visual clues in feature maps. In particular, uncovering\nlocality-aware clues across different scales are conducive to reconstructing\nregion-aware features, thereby significantly benefiting distillation\nperformance. In this study, we propose a fine-grained adaptive feature masking\ndistillation framework for accurate object detection. Different from previous\nmethods in which global masking is performed on single-scale feature maps, we\nexplore the scale-aware feature masking by performing feature distillation\nacross various scales, such that the object-aware locality is encoded for\nimproved feature reconstruction. In addition, our fine-grained feature\ndistillation strategy is combined with a masking logits distillation scheme in\nwhich logits difference between teacher and student networks is utilized to\nguide the distillation process. 
Thus, it can help the student model to better\nlearn from the teacher counterpart with improved knowledge transfer. Extensive\nexperiments for detection task demonstrate the superiority of our method. For\nexample, when RetinaNet, RepPoints and Cascade Mask RCNN are used as teacher\ndetectors, the student network achieves mAP scores of 41.5\\%, 42.9\\%, and\n42.6\\%, respectively, outperforming state-of-the-art methods such as DMKD and\nFreeKD.\n","authors":["ZhouRui Zhang","Jun Li","JiaYan Li","ZhiJian Wu","JianHua Xu"],"pdf_url":"https://arxiv.org/pdf/2501.07101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07100v1","updated":"2025-01-13T07:26:05Z","published":"2025-01-13T07:26:05Z","title":"Collaborative Learning for 3D Hand-Object Reconstruction and\n Compositional Action Recognition from Egocentric RGB Videos Using\n Superquadrics","summary":" With the availability of egocentric 3D hand-object interaction datasets,\nthere is increasing interest in developing unified models for hand-object pose\nestimation and action recognition. However, existing methods still struggle to\nrecognise seen actions on unseen objects due to the limitations in representing\nobject shape and movement using 3D bounding boxes. Additionally, the reliance\non object templates at test time limits their generalisability to unseen\nobjects. To address these challenges, we propose to leverage superquadrics as\nan alternative 3D object representation to bounding boxes and demonstrate their\neffectiveness on both template-free object reconstruction and action\nrecognition tasks. Moreover, as we find that pure appearance-based methods can\noutperform the unified methods, the potential benefits from 3D geometric\ninformation remain unclear. Therefore, we study the compositionality of actions\nby considering a more challenging task where the training combinations of verbs\nand nouns do not overlap with the testing split. We extend H2O and FPHA\ndatasets with compositional splits and design a novel collaborative learning\nframework that can explicitly reason about the geometric relations between\nhands and the manipulated object. Through extensive quantitative and\nqualitative evaluations, we demonstrate significant improvements over the\nstate-of-the-arts in (compositional) action recognition.\n","authors":["Tze Ho Elden Tse","Runyang Feng","Linfang Zheng","Jiho Park","Yixing Gao","Jihie Kim","Ales Leonardis","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2501.07100v1.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2412.07249v2","updated":"2025-01-13T07:22:02Z","published":"2024-12-10T07:18:51Z","title":"Buster: Implanting Semantic Backdoor into Text Encoder to Mitigate NSFW\n Content Generation","summary":" The rise of deep learning models in the digital era has raised substantial\nconcerns regarding the generation of Not-Safe-for-Work (NSFW) content. Existing\ndefense methods primarily involve model fine-tuning and post-hoc content\nmoderation. Nevertheless, these approaches largely lack scalability in\neliminating harmful content, degrade the quality of benign image generation, or\nincur high inference costs. To address these challenges, we propose an\ninnovative framework named \\textit{Buster}, which injects backdoors into the\ntext encoder to prevent NSFW content generation. Buster leverages deep semantic\ninformation rather than explicit prompts as triggers, redirecting NSFW prompts\ntowards targeted benign prompts. 
Additionally, Buster employs energy-based\ntraining data generation through Langevin dynamics for adversarial knowledge\naugmentation, thereby ensuring robustness in harmful concept definition. This\napproach demonstrates exceptional resilience and scalability in mitigating NSFW\ncontent. Particularly, Buster fine-tunes the text encoder of Text-to-Image\nmodels within merely five minutes, showcasing its efficiency. Our extensive\nexperiments denote that Buster outperforms nine state-of-the-art baselines,\nachieving a superior NSFW content removal rate of at least 91.2\\% while\npreserving the quality of harmless images.\n","authors":["Xin Zhao","Xiaojun Chen","Yuexin Xuan","Zhendong Zhao","Xiaojun Jia","Xinfeng Li","Xiaofeng Wang"],"pdf_url":"https://arxiv.org/pdf/2412.07249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15378v2","updated":"2025-01-13T06:49:22Z","published":"2023-08-29T15:16:51Z","title":"On the Robustness of Object Detection Models on Aerial Images","summary":" The robustness of object detection models is a major concern when applied to\nreal-world scenarios. The performance of most models tends to degrade when\nconfronted with images affected by corruptions, since they are usually trained\nand evaluated on clean datasets. While numerous studies have explored the\nrobustness of object detection models on natural images, there is a paucity of\nresearch focused on models applied to aerial images, which feature complex\nbackgrounds, substantial variations in scales, and orientations of objects.\nThis paper addresses the challenge of assessing the robustness of object\ndetection models on aerial images, with a specific emphasis on scenarios where\nimages are affected by clouds. In this study, we introduce two novel benchmarks\nbased on DOTA-v1.0. The first benchmark encompasses 19 prevalent corruptions,\nwhile the second focuses on the cloud-corrupted condition-a phenomenon uncommon\nin natural images yet frequent in aerial photography. We systematically\nevaluate the robustness of mainstream object detection models and perform\nnecessary ablation experiments. Through our investigations, we find that\nrotation-invariant modeling and enhanced backbone architectures can improve the\nrobustness of models. Furthermore, increasing the capacity of Transformer-based\nbackbones can strengthen their robustness. The benchmarks we propose and our\ncomprehensive experimental analyses can facilitate research on robust object\ndetection on aerial images. The codes and datasets are available at:\nhttps://github.com/hehaodong530/DOTA-C.\n","authors":["Haodong He","Jian Ding","Bowen Xu","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2308.15378v2.pdf","comment":"accepted by IEEE TGRS"},{"id":"http://arxiv.org/abs/2501.07087v1","updated":"2025-01-13T06:45:32Z","published":"2025-01-13T06:45:32Z","title":"Video Quality Assessment for Online Processing: From Spatial to Temporal\n Sampling","summary":" With the rapid development of multimedia processing and deep learning\ntechnologies, especially in the field of video understanding, video quality\nassessment (VQA) has achieved significant progress. Although researchers have\nmoved from designing efficient video quality mapping models to various research\ndirections, in-depth exploration of the effectiveness-efficiency trade-offs of\nspatio-temporal modeling in VQA models is still less sufficient. 
Considering\nthe fact that videos have highly redundant information, this paper investigates\nthis problem from the perspective of joint spatial and temporal sampling,\naiming to seek the answer to how little information we should keep at least\nwhen feeding videos into the VQA models while with acceptable performance\nsacrifice. To this end, we drastically sample the video's information from both\nspatial and temporal dimensions, and the heavily squeezed video is then fed\ninto a stable VQA model. Comprehensive experiments regarding joint spatial and\ntemporal sampling are conducted on six public video quality databases, and the\nresults demonstrate the acceptable performance of the VQA model when throwing\naway most of the video information. Furthermore, with the proposed joint\nspatial and temporal sampling strategy, we make an initial attempt to design an\nonline VQA model, which is instantiated by as simple as possible a spatial\nfeature extractor, a temporal feature fusion module, and a global quality\nregression module. Through quantitative and qualitative experiments, we verify\nthe feasibility of online VQA model by simplifying itself and reducing input.\n","authors":["Jiebin Yan","Lei Wu","Yuming Fang","Xuelin Liu","Xue Xia","Weide Liu"],"pdf_url":"https://arxiv.org/pdf/2501.07087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07076v1","updated":"2025-01-13T06:13:25Z","published":"2025-01-13T06:13:25Z","title":"Representation Learning of Point Cloud Upsampling in Global and Local\n Inputs","summary":" In recent years, point cloud upsampling has been widely applied in fields\nsuch as 3D reconstruction. Our study investigates the factors influencing point\ncloud upsampling on both global and local levels through representation\nlearning. Specifically, the paper inputs global and local information of the\nsame point cloud model object into two encoders to extract these features,\nfuses them, and then feeds the combined features into an upsampling decoder.\nThe goal is to address issues of sparsity and noise in point clouds by\nleveraging prior knowledge from both global and local inputs. And the proposed\nframework can be applied to any state-of-the-art point cloud upsampling neural\nnetwork. Experiments were conducted on a series of autoencoder-based models\nutilizing deep learning, yielding interpretability for both global and local\ninputs, and it has been proven in the results that our proposed framework can\nfurther improve the upsampling effect in previous SOTA works. At the same time,\nthe Saliency Map reflects the differences between global and local feature\ninputs, as well as the effectiveness of training with both inputs in parallel.\n","authors":["Tongxu Zhang","Bei Wang"],"pdf_url":"https://arxiv.org/pdf/2501.07076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07072v1","updated":"2025-01-13T05:57:09Z","published":"2025-01-13T05:57:09Z","title":"Label Calibration in Source Free Domain Adaptation","summary":" Source-free domain adaptation (SFDA) utilizes a pre-trained source model with\nunlabeled target data. Self-supervised SFDA techniques generate pseudolabels\nfrom the pre-trained source model, but these pseudolabels often contain noise\ndue to domain discrepancies between the source and target domains. Traditional\nself-supervised SFDA techniques rely on deterministic model predictions using\nthe softmax function, leading to unreliable pseudolabels. 
In this work, we\npropose to introduce predictive uncertainty and softmax calibration for\npseudolabel refinement using evidential deep learning. The Dirichlet prior is\nplaced over the output of the target network to capture uncertainty using\nevidence with a single forward pass. Furthermore, softmax calibration solves\nthe translation invariance problem to assist in learning with noisy labels. We\nincorporate a combination of evidential deep learning loss and information\nmaximization loss with calibrated softmax in both prior and non-prior target\nknowledge SFDA settings. Extensive experimental analysis shows that our method\noutperforms other state-of-the-art methods on benchmark datasets.\n","authors":["Shivangi Rai","Rini Smita Thakur","Kunal Jangid","Vinod K Kurmi"],"pdf_url":"https://arxiv.org/pdf/2501.07072v1.pdf","comment":"Accepted in IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2501.07070v1","updated":"2025-01-13T05:48:32Z","published":"2025-01-13T05:48:32Z","title":"Enhancing Image Generation Fidelity via Progressive Prompts","summary":" The diffusion transformer (DiT) architecture has attracted significant\nattention in image generation, achieving better fidelity, performance, and\ndiversity. However, most existing DiT-based image generation methods focus on\nglobal-aware synthesis, and regional prompt control has been less explored.\nIn this paper, we propose a coarse-to-fine generation pipeline for regional\nprompt-following generation. Specifically, we first utilize the powerful\nlarge language model (LLM) to generate both high-level descriptions of the\nimage (such as content, topic, and objects) and low-level descriptions (such\nas details and style). Then, we explore the influence of cross-attention\nlayers at different depths. We find that deeper layers are always responsible\nfor high-level content control, while shallow layers handle low-level\ncontent control. Various prompts are injected into the proposed regional\ncross-attention control for coarse-to-fine generation. By using the proposed\npipeline, we enhance the controllability of DiT-based image generation.\nExtensive quantitative and qualitative results show that our pipeline can\nimprove the performance of the generated images.\n","authors":["Zhen Xiong","Yuqi Li","Chuanguang Yang","Tiao Tan","Zhihong Zhu","Siyuan Li","Yue Ma"],"pdf_url":"https://arxiv.org/pdf/2501.07070v1.pdf","comment":"Accepted by ICASSP 2025, Github:\n https://github.com/ZhenXiong-dl/ICASSP2025-RCAC"},{"id":"http://arxiv.org/abs/2501.07069v1","updated":"2025-01-13T05:39:43Z","published":"2025-01-13T05:39:43Z","title":"Hierarchical Superpixel Segmentation via Structural Information Theory","summary":" Superpixel segmentation is a foundation for many higher-level computer vision\ntasks, such as image segmentation, object recognition, and scene understanding.\nExisting graph-based superpixel segmentation methods typically concentrate on\nthe relationships between a given pixel and its directly adjacent pixels while\noverlooking the influence of non-adjacent pixels. These approaches do not fully\nleverage the global information in the graph, leading to suboptimal\nsegmentation quality. To address this limitation, we present SIT-HSS, a\nhierarchical superpixel segmentation method based on structural information\ntheory. 
Specifically, we first design a novel graph construction strategy that\nincrementally explores the pixel neighborhood to add edges based on\n1-dimensional structural entropy (1D SE). This strategy maximizes the retention\nof graph information while avoiding an overly complex graph structure. Then, we\ndesign a new 2D SE-guided hierarchical graph partitioning method, which\niteratively merges pixel clusters layer by layer to reduce the graph's 2D SE\nuntil a predefined segmentation scale is achieved. Experimental results on\nthree benchmark datasets demonstrate that the SIT-HSS performs better than\nstate-of-the-art unsupervised superpixel segmentation algorithms. The source\ncode is available at \\url{https://github.com/SELGroup/SIT-HSS}.\n","authors":["Minhui Xie","Hao Peng","Pu Li","Guangjie Zeng","Shuhai Wang","Jia Wu","Peng Li","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2501.07069v1.pdf","comment":"Accepted by SDM 2025"},{"id":"http://arxiv.org/abs/2406.17442v3","updated":"2025-01-13T05:36:58Z","published":"2024-06-25T10:23:53Z","title":"Pamba: Enhancing Global Interaction in Point Clouds via State Space\n Model","summary":" Transformers have demonstrated impressive results for 3D point cloud semantic\nsegmentation. However, the quadratic complexity of transformer makes\ncomputation costs high, limiting the number of points that can be processed\nsimultaneously and impeding the modeling of long-range dependencies between\nobjects in a single scene. Drawing inspiration from the great potential of\nrecent state space models (SSM) for long sequence modeling, we introduce Mamba,\nan SSM-based architecture, to the point cloud domain and propose Pamba, a novel\narchitecture with strong global modeling capability under linear complexity.\nSpecifically, to make the disorderness of point clouds fit in with the causal\nnature of Mamba, we propose a multi-path serialization strategy applicable to\npoint clouds. Besides, we propose the ConvMamba block to compensate for the\nshortcomings of Mamba in modeling local geometries and in unidirectional\nmodeling. Pamba obtains state-of-the-art results on several 3D point cloud\nsegmentation tasks, including ScanNet v2, ScanNet200, S3DIS and nuScenes, while\nits effectiveness is validated by extensive experiments.\n","authors":["Zhuoyuan Li","Yubo Ai","Jiahao Lu","ChuXin Wang","Jiacheng Deng","Hanzhi Chang","Yanzhe Liang","Wenfei Yang","Shifeng Zhang","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.17442v3.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2410.20974v2","updated":"2025-01-13T05:06:17Z","published":"2024-10-28T12:46:05Z","title":"MovieCharacter: A Tuning-Free Framework for Controllable Character Video\n Synthesis","summary":" Recent advancements in character video synthesis still depend on extensive\nfine-tuning or complex 3D modeling processes, which can restrict accessibility\nand hinder real-time applicability. To address these challenges, we propose a\nsimple yet effective tuning-free framework for character video synthesis, named\nMovieCharacter, designed to streamline the synthesis process while ensuring\nhigh-quality outcomes. Our framework decomposes the synthesis task into\ndistinct, manageable modules: character segmentation and tracking, video object\nremoval, character motion imitation, and video composition. This modular design\nnot only facilitates flexible customization but also ensures that each\ncomponent operates collaboratively to effectively meet user needs. 
By\nleveraging existing open-source models and integrating well-established\ntechniques, MovieCharacter achieves impressive synthesis results without\nnecessitating substantial resources or proprietary datasets. Experimental\nresults demonstrate that our framework enhances the efficiency, accessibility,\nand adaptability of character video synthesis, paving the way for broader\ncreative and interactive applications.\n","authors":["Di Qiu","Zheng Chen","Rui Wang","Mingyuan Fan","Changqian Yu","Junshi Huang","Xiang Wen"],"pdf_url":"https://arxiv.org/pdf/2410.20974v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16837v2","updated":"2025-01-13T05:04:59Z","published":"2024-07-23T21:02:38Z","title":"MLLM-CompBench: A Comparative Reasoning Benchmark for Multimodal LLMs","summary":" The ability to compare objects, scenes, or situations is crucial for\neffective decision-making and problem-solving in everyday life. For instance,\ncomparing the freshness of apples enables better choices during grocery\nshopping while comparing sofa designs helps optimize the aesthetics of our\nliving space. Despite its significance, the comparative capability is largely\nunexplored in artificial general intelligence (AGI). In this paper, we\nintroduce MLLM-CompBench, a benchmark designed to evaluate the comparative\nreasoning capability of multimodal large language models (MLLMs).\nMLLM-CompBench mines and pairs images through visually oriented questions\ncovering eight dimensions of relative comparison: visual attribute, existence,\nstate, emotion, temporality, spatiality, quantity, and quality. We curate a\ncollection of around 40K image pairs using metadata from diverse vision\ndatasets and CLIP similarity scores. These image pairs span a broad array of\nvisual domains, including animals, fashion, sports, and both outdoor and indoor\nscenes. The questions are carefully crafted to discern relative characteristics\nbetween two images and are labeled by human annotators for accuracy and\nrelevance. We use MLLM-CompBench to evaluate recent MLLMs, including\nGPT-4V(ision), Gemini-Pro, and LLaVA-1.6. Our results reveal notable\nshortcomings in their comparative abilities. We believe MLLM-COMPBENCH not only\nsheds light on these limitations but also establishes a solid foundation for\nfuture enhancements in the comparative capability of MLLMs.\n","authors":["Jihyung Kil","Zheda Mai","Justin Lee","Zihe Wang","Kerrie Cheng","Lemeng Wang","Ye Liu","Arpita Chowdhury","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2407.16837v2.pdf","comment":"This paper has been accepted to NeurIPS 2024. The first two authors\n contributed equally to this work"},{"id":"http://arxiv.org/abs/2501.07055v1","updated":"2025-01-13T04:30:41Z","published":"2025-01-13T04:30:41Z","title":"SFC-GAN: A Generative Adversarial Network for Brain Functional and\n Structural Connectome Translation","summary":" Modern brain imaging technologies have enabled the detailed reconstruction of\nhuman brain connectomes, capturing structural connectivity (SC) from diffusion\nMRI and functional connectivity (FC) from functional MRI. Understanding the\nintricate relationships between SC and FC is vital for gaining deeper insights\ninto the brain's functional and organizational mechanisms. However, obtaining\nboth SC and FC modalities simultaneously remains challenging, hindering\ncomprehensive analyses. 
Existing deep generative models typically focus on\nsynthesizing a single modality or unidirectional translation between FC and SC,\nthereby missing the potential benefits of bi-directional translation,\nespecially in scenarios where only one connectome is available. Therefore, we\npropose Structural-Functional Connectivity GAN (SFC-GAN), a novel framework for\nbidirectional translation between SC and FC. This approach leverages the\nCycleGAN architecture, incorporating convolutional layers to effectively\ncapture the spatial structures of brain connectomes. To preserve the\ntopological integrity of these connectomes, we employ a structure-preserving\nloss that guides the model in capturing both global and local connectome\npatterns while maintaining symmetry. Our framework demonstrates superior\nperformance in translating between SC and FC, outperforming baseline models in\nsimilarity and graph property evaluations compared to ground truth data, each\ntranslated modality can be effectively utilized for downstream classification.\n","authors":["Yee-Fan Tan","Jun Lin Liow","Pei-Sze Tan","Fuad Noman","Raphael C. -W. Phan","Hernando Ombao","Chee-Ming Ting"],"pdf_url":"https://arxiv.org/pdf/2501.07055v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.11477v3","updated":"2025-01-13T04:22:25Z","published":"2024-11-18T11:26:11Z","title":"SL-YOLO: A Stronger and Lighter Drone Target Detection Model","summary":" Detecting small objects in complex scenes, such as those captured by drones,\nis a daunting challenge due to the difficulty in capturing the complex features\nof small targets. While the YOLO family has achieved great success in large\ntarget detection, its performance is less than satisfactory when faced with\nsmall targets. Because of this, this paper proposes a revolutionary model\nSL-YOLO (Stronger and Lighter YOLO) that aims to break the bottleneck of small\ntarget detection. We propose the Hierarchical Extended Path Aggregation Network\n(HEPAN), a pioneering cross-scale feature fusion method that can ensure\nunparalleled detection accuracy even in the most challenging environments. At\nthe same time, without sacrificing detection capabilities, we design the C2fDCB\nlightweight module and add the SCDown downsampling module to greatly reduce the\nmodel's parameters and computational complexity. Our experimental results on\nthe VisDrone2019 dataset reveal a significant improvement in performance, with\nmAP@0.5 jumping from 43.0% to 46.9% and mAP@0.5:0.95 increasing from 26.0% to\n28.9%. At the same time, the model parameters are reduced from 11.1M to 9.6M,\nand the FPS can reach 132, making it an ideal solution for real-time small\nobject detection in resource-constrained environments.\n","authors":["Defan Chen","Luchan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11477v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.20870v2","updated":"2025-01-13T04:11:06Z","published":"2024-12-30T11:16:49Z","title":"SoftPatch+: Fully Unsupervised Anomaly Classification and Segmentation","summary":" Although mainstream unsupervised anomaly detection (AD) (including\nimage-level classification and pixel-level segmentation)algorithms perform well\nin academic datasets, their performance is limited in practical application due\nto the ideal experimental setting of clean training data. Training with noisy\ndata is an inevitable problem in real-world anomaly detection but is seldom\ndiscussed. 
This paper is the first to consider fully unsupervised industrial\nanomaly detection (i.e., unsupervised AD with noisy data). To solve this\nproblem, we propose memory-based unsupervised AD methods, SoftPatch and\nSoftPatch+, which efficiently denoise the data at the patch level. Noise\ndiscriminators are utilized to generate outlier scores for patch-level noise\nelimination before coreset construction. The scores are then stored in the\nmemory bank to soften the anomaly detection boundary. Compared with existing\nmethods, SoftPatch maintains a strong modeling ability of normal data and\nalleviates the overconfidence problem in coreset, and SoftPatch+ has more\nrobust performance, which is particularly useful in real-world industrial\ninspection scenarios with high levels of noise (from 10% to 40%). Comprehensive\nexperiments conducted in diverse noise scenarios demonstrate that both\nSoftPatch and SoftPatch+ outperform the state-of-the-art AD methods on the\nMVTecAD, ViSA, and BTAD benchmarks. Furthermore, the performance of SoftPatch\nand SoftPatch+ is comparable to that of the noise-free methods in the conventional\nunsupervised AD setting. The code of the proposed methods can be found at\nhttps://github.com/TencentYoutuResearch/AnomalyDetection-SoftPatch.\n","authors":["Chengjie Wang","Xi Jiang","Bin-Bin Gao","Zhenye Gan","Yong Liu","Feng Zheng","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2412.20870v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2403.14233\n paper has been accepted by Pattern Recognition"},{"id":"http://arxiv.org/abs/2501.07044v1","updated":"2025-01-13T03:54:19Z","published":"2025-01-13T03:54:19Z","title":"Protego: Detecting Adversarial Examples for Vision Transformers via\n Intrinsic Capabilities","summary":" Transformer models have excelled in natural language tasks, prompting the\nvision community to explore their implementation in computer vision problems.\nHowever, these models are still influenced by adversarial examples. In this\npaper, we investigate the attack capabilities of six common adversarial attacks\non three pretrained ViT models to reveal the vulnerability of ViT models. To\nunderstand and analyse the bias in neural network decisions when the input is\nadversarial, we use two visualisation techniques, attention rollout and\ngrad attention rollout. To protect ViT models from adversarial attacks, we\npropose Protego, a detection framework that leverages the transformer's intrinsic\ncapabilities to detect adversarial examples of ViT models. Nonetheless, this\nis challenging due to a diversity of attack strategies that may be adopted by\nadversaries. Inspired by the attention mechanism, we know that the token of\nprediction contains all the information from the input sample. Additionally,\nthe attention region for adversarial examples differs from that of normal\nexamples. Given these points, we can train a detector that achieves superior\nperformance to existing detection methods in identifying adversarial examples.\nOur experiments have demonstrated the high effectiveness of our detection\nmethod. For these six adversarial attack methods, our detector's AUC scores all\nexceed 0.95. 
Protego may advance investigations in metaverse security.\n","authors":["Jialin Wu","Kaikai Pan","Yanjiao Chen","Jiangyi Deng","Shengyuan Pang","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2501.07044v1.pdf","comment":"Accepted by IEEE MetaCom 2024"},{"id":"http://arxiv.org/abs/2501.07040v1","updated":"2025-01-13T03:43:21Z","published":"2025-01-13T03:43:21Z","title":"Rethinking Knowledge in Distillation: An In-context Sample Retrieval\n Perspective","summary":" Conventional knowledge distillation (KD) approaches are designed for the\nstudent model to predict similar output as the teacher model for each sample.\nUnfortunately, the relationship across samples with same class is often\nneglected. In this paper, we explore to redefine the knowledge in distillation,\ncapturing the relationship between each sample and its corresponding in-context\nsamples (a group of similar samples with the same or different classes), and\nperform KD from an in-context sample retrieval perspective. As KD is a type of\nlearned label smoothing regularization (LSR), we first conduct a theoretical\nanalysis showing that the teacher's knowledge from the in-context samples is a\ncrucial contributor to regularize the student training with the corresponding\nsamples. Buttressed by the analysis, we propose a novel in-context knowledge\ndistillation (IC-KD) framework that shows its superiority across diverse KD\nparadigms (offline, online, and teacher-free KD). Firstly, we construct a\nfeature memory bank from the teacher model and retrieve in-context samples for\neach corresponding sample through retrieval-based learning. We then introduce\nPositive In-Context Distillation (PICD) to reduce the discrepancy between a\nsample from the student and the aggregated in-context samples with the same\nclass from the teacher in the logit space. Moreover, Negative In-Context\nDistillation (NICD) is introduced to separate a sample from the student and the\nin-context samples with different classes from the teacher in the logit space.\nExtensive experiments demonstrate that IC-KD is effective across various types\nof KD, and consistently achieves state-of-the-art performance on CIFAR-100 and\nImageNet datasets.\n","authors":["Jinjing Zhu","Songze Li","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2501.07040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07039v1","updated":"2025-01-13T03:41:57Z","published":"2025-01-13T03:41:57Z","title":"IoT-Based Real-Time Medical-Related Human Activity Recognition Using\n Skeletons and Multi-Stage Deep Learning for Healthcare","summary":" The Internet of Things (IoT) and mobile technology have significantly\ntransformed healthcare by enabling real-time monitoring and diagnosis of\npatients. Recognizing medical-related human activities (MRHA) is pivotal for\nhealthcare systems, particularly for identifying actions that are critical to\npatient well-being. However, challenges such as high computational demands, low\naccuracy, and limited adaptability persist in Human Motion Recognition (HMR).\nWhile some studies have integrated HMR with IoT for real-time healthcare\napplications, limited research has focused on recognizing MRHA as essential for\neffective patient monitoring. 
This study proposes a novel HMR method for MRHA\ndetection, leveraging multi-stage deep learning techniques integrated with IoT.\nThe approach employs EfficientNet to extract optimized spatial features from\nskeleton frame sequences using seven Mobile Inverted Bottleneck Convolutions\n(MBConv) blocks, followed by ConvLSTM to capture spatio-temporal patterns. A\nclassification module with global average pooling, a fully connected layer, and\na dropout layer generates the final predictions. The model is evaluated on the\nNTU RGB+D 120 and HMDB51 datasets, focusing on MRHA, such as sneezing, falling,\nwalking, sitting, etc. It achieves 94.85% accuracy for cross-subject\nevaluations and 96.45% for cross-view evaluations on NTU RGB+D 120, along with\n89.00% accuracy on HMDB51. Additionally, the system integrates IoT capabilities\nusing a Raspberry Pi and GSM module, delivering real-time alerts via Twilios\nSMS service to caregivers and patients. This scalable and efficient solution\nbridges the gap between HMR and IoT, advancing patient monitoring, improving\nhealthcare outcomes, and reducing costs.\n","authors":["Subrata Kumer Paul","Abu Saleh Musa Miah","Rakhi Rani Paul","Md. Ekramul Hamid","Jungpil Shin","Md Abdur Rahim"],"pdf_url":"https://arxiv.org/pdf/2501.07039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04184v2","updated":"2025-01-13T03:33:36Z","published":"2025-01-07T23:32:05Z","title":"MedicalNarratives: Connecting Medical Vision and Language with Localized\n Narratives","summary":" We propose MedicalNarratives, a dataset curated from medical pedagogical\nvideos similar in nature to data collected in Think-Aloud studies and inspired\nby Localized Narratives, which collects grounded image-text data by curating\ninstructors' speech and mouse cursor movements synchronized in time.\nMedicalNarratives enables pretraining of both semantic and dense objectives,\nalleviating the need to train medical semantic and dense tasks disparately due\nto the lack of reasonably sized datasets. Our dataset contains 4.7M image-text\npairs from videos and articles, with 1M samples containing dense annotations in\nthe form of traces and bounding boxes. To evaluate the utility of\nMedicalNarratives, we train GenMedClip based on the CLIP architecture using our\ndataset spanning 12 medical domains and demonstrate that it outperforms\nprevious state-of-the-art models on a newly constructed medical imaging\nbenchmark that comprehensively evaluates performance across all modalities.\nData, demo, code and models available at https://medical-narratives.github.io\n","authors":["Wisdom O. Ikezogwo","Kevin Zhang","Mehmet Saygin Seyfioglu","Fatemeh Ghezloo","Linda Shapiro","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2501.04184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09792v3","updated":"2025-01-13T03:30:37Z","published":"2024-03-14T18:24:55Z","title":"Images are Achilles' Heel of Alignment: Exploiting Visual\n Vulnerabilities for Jailbreaking Multimodal Large Language Models","summary":" In this paper, we study the harmlessness alignment problem of multimodal\nlarge language models (MLLMs). We conduct a systematic empirical analysis of\nthe harmlessness performance of representative MLLMs and reveal that the image\ninput poses the alignment vulnerability of MLLMs. Inspired by this, we propose\na novel jailbreak method named HADES, which hides and amplifies the harmfulness\nof the malicious intent within the text input, using meticulously crafted\nimages. 
Experimental results show that HADES can effectively jailbreak existing\nMLLMs, which achieves an average Attack Success Rate (ASR) of 90.26% for\nLLaVA-1.5 and 71.60% for Gemini Pro Vision. Our code and data are available at\nhttps://github.com/RUCAIBox/HADES.\n","authors":["Yifan Li","Hangyu Guo","Kun Zhou","Wayne Xin Zhao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2403.09792v3.pdf","comment":"ECCV 2024 Oral"},{"id":"http://arxiv.org/abs/2501.07033v1","updated":"2025-01-13T03:10:54Z","published":"2025-01-13T03:10:54Z","title":"Detection of AI Deepfake and Fraud in Online Payments Using GAN-Based\n Models","summary":" This study explores the use of Generative Adversarial Networks (GANs) to\ndetect AI deepfakes and fraudulent activities in online payment systems. With\nthe growing prevalence of deepfake technology, which can manipulate facial\nfeatures in images and videos, the potential for fraud in online transactions\nhas escalated. Traditional security systems struggle to identify these\nsophisticated forms of fraud. This research proposes a novel GAN-based model\nthat enhances online payment security by identifying subtle manipulations in\npayment images. The model is trained on a dataset consisting of real-world\nonline payment images and deepfake images generated using advanced GAN\narchitectures, such as StyleGAN and DeepFake. The results demonstrate that the\nproposed model can accurately distinguish between legitimate transactions and\ndeepfakes, achieving a high detection rate above 95%. This approach\nsignificantly improves the robustness of payment systems against AI-driven\nfraud. The paper contributes to the growing field of digital security, offering\ninsights into the application of GANs for fraud detection in financial\nservices. Keywords- Payment Security, Image Recognition, Generative Adversarial\nNetworks, AI Deepfake, Fraudulent Activities\n","authors":["Zong Ke","Shicheng Zhou","Yining Zhou","Chia Hong Chang","Rong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07033v1.pdf","comment":"The paper will be published and indexed by IEEE at 2025 8th\n International Conference on Advanced Algorithms and Control Engineering\n (ICAACE 2025)"},{"id":"http://arxiv.org/abs/2411.19714v2","updated":"2025-01-13T02:43:47Z","published":"2024-11-29T14:02:00Z","title":"The Streetscape Application Services Stack (SASS): Towards a Distributed\n Sensing Architecture for Urban Applications","summary":" As urban populations grow, cities are becoming more complex, driving the\ndeployment of interconnected sensing systems to realize the vision of smart\ncities. These systems aim to improve safety, mobility, and quality of life\nthrough applications that integrate diverse sensors with real-time\ndecision-making. Streetscape applications-focusing on challenges like\npedestrian safety and adaptive traffic management-depend on managing\ndistributed, heterogeneous sensor data, aligning information across time and\nspace, and enabling real-time processing. These tasks are inherently complex\nand often difficult to scale. The Streetscape Application Services Stack (SASS)\naddresses these challenges with three core services: multimodal data\nsynchronization, spatiotemporal data fusion, and distributed edge computing. 
By\nstructuring these capabilities as clear, composable abstractions with clear\nsemantics, SASS allows developers to scale streetscape applications efficiently\nwhile minimizing the complexity of multimodal integration.\n We evaluated SASS in two real-world testbed environments: a controlled\nparking lot and an urban intersection in a major U.S. city. These testbeds\nallowed us to test SASS under diverse conditions, demonstrating its practical\napplicability. The Multimodal Data Synchronization service reduced temporal\nmisalignment errors by 88%, achieving synchronization accuracy within 50\nmilliseconds. Spatiotemporal Data Fusion service improved detection accuracy\nfor pedestrians and vehicles by over 10%, leveraging multicamera integration.\nThe Distributed Edge Computing service increased system throughput by more than\nan order of magnitude. Together, these results show how SASS provides the\nabstractions and performance needed to support real-time, scalable urban\napplications, bridging the gap between sensing infrastructure and actionable\nstreetscape intelligence.\n","authors":["Navid Salami Pargoo","Mahshid Ghasemi","Shuren Xia","Mehmet Kerem Turkcan","Taqiya Ehsan","Chengbo Zang","Yuan Sun","Javad Ghaderi","Gil Zussman","Zoran Kostic","Jorge Ortiz"],"pdf_url":"https://arxiv.org/pdf/2411.19714v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05901v2","updated":"2025-01-13T02:34:19Z","published":"2025-01-10T11:53:46Z","title":"Valley2: Exploring Multimodal Models with Scalable Vision-Language\n Design","summary":" Recently, vision-language models have made remarkable progress, demonstrating\noutstanding capabilities in various tasks such as image captioning and video\nunderstanding. We introduce Valley2, a novel multimodal large language model\ndesigned to enhance performance across all domains and extend the boundaries of\npractical applications in e-commerce and short video scenarios. Notably,\nValley2 achieves state-of-the-art (SOTA) performance on e-commerce benchmarks,\nsurpassing open-source models of similar size by a large margin (79.66 vs.\n72.76). Additionally, Valley2 ranks second on the OpenCompass leaderboard among\nmodels with fewer than 10B parameters, with an impressive average score of\n67.4. The code and model weights are open-sourced at\nhttps://github.com/bytedance/Valley.\n","authors":["Ziheng Wu","Zhenghao Chen","Ruipu Luo","Can Zhang","Yuan Gao","Zhentao He","Xian Wang","Haoran Lin","Minghui Qiu"],"pdf_url":"https://arxiv.org/pdf/2501.05901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07017v1","updated":"2025-01-13T02:33:28Z","published":"2025-01-13T02:33:28Z","title":"UNetVL: Enhancing 3D Medical Image Segmentation with Chebyshev KAN\n Powered Vision-LSTM","summary":" 3D medical image segmentation has progressed considerably due to\nConvolutional Neural Networks (CNNs) and Vision Transformers (ViTs), yet these\nmethods struggle to balance long-range dependency acquisition with\ncomputational efficiency. To address this challenge, we propose UNETVL (U-Net\nVision-LSTM), a novel architecture that leverages recent advancements in\ntemporal information processing. UNETVL incorporates Vision-LSTM (ViL) for\nimproved scalability and memory functions, alongside an efficient Chebyshev\nKolmogorov-Arnold Networks (KAN) to handle complex and long-range dependency\npatterns more effectively. 
We validated our method on the ACDC and AMOS2022\n(post challenge Task 2) benchmark datasets, showing a significant improvement\nin mean Dice score compared to recent state-of-the-art approaches, especially\nover its predecessor, UNETR, with increases of 7.3% on ACDC and 15.6% on AMOS,\nrespectively. Extensive ablation studies were conducted to demonstrate the\nimpact of each component in UNETVL, providing a comprehensive understanding of\nits architecture. Our code is available at https://github.com/tgrex6/UNETVL,\nfacilitating further research and applications in this domain.\n","authors":["Xuhui Guo","Tanmoy Dam","Rohan Dhamdhere","Gourav Modanwal","Anant Madabhushi"],"pdf_url":"https://arxiv.org/pdf/2501.07017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07016v1","updated":"2025-01-13T02:29:42Z","published":"2025-01-13T02:29:42Z","title":"A Multi-Modal Deep Learning Framework for Pan-Cancer Prognosis","summary":" Prognostic task is of great importance as it closely related to the survival\nanalysis of patients, the optimization of treatment plans and the allocation of\nresources. The existing prognostic models have shown promising results on\nspecific datasets, but there are limitations in two aspects. On the one hand,\nthey merely explore certain types of modal data, such as patient histopathology\nWSI and gene expression analysis. On the other hand, they adopt the\nper-cancer-per-model paradigm, which means the trained models can only predict\nthe prognostic effect of a single type of cancer, resulting in weak\ngeneralization ability. In this paper, a deep-learning based model, named\nUMPSNet, is proposed. Specifically, to comprehensively understand the condition\nof patients, in addition to constructing encoders for histopathology images and\ngenomic expression profiles respectively, UMPSNet further integrates four types\nof important meta data (demographic information, cancer type information,\ntreatment protocols, and diagnosis results) into text templates, and then\nintroduces a text encoder to extract textual features. In addition, the optimal\ntransport OT-based attention mechanism is utilized to align and fuse features\nof different modalities. Furthermore, a guided soft mixture of experts (GMoE)\nmechanism is introduced to effectively address the issue of distribution\ndifferences among multiple cancer datasets. By incorporating the multi-modality\nof patient data and joint training, UMPSNet outperforms all SOTA approaches,\nand moreover, it demonstrates the effectiveness and generalization ability of\nthe proposed learning paradigm of a single model for multiple cancer types. The\ncode of UMPSNet is available at https://github.com/binging512/UMPSNet.\n","authors":["Binyu Zhang","Shichao Li","Junpeng Jian","Zhu Meng","Limei Guo","Zhicheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.07016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07015v1","updated":"2025-01-13T02:28:13Z","published":"2025-01-13T02:28:13Z","title":"SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting","summary":" Achieving high-fidelity 3D reconstruction from monocular video remains\nchallenging due to the inherent limitations of traditional methods like\nStructure-from-Motion (SfM) and monocular SLAM in accurately capturing scene\ndetails. While differentiable rendering techniques such as Neural Radiance\nFields (NeRF) address some of these challenges, their high computational costs\nmake them unsuitable for real-time applications. 
Additionally, existing 3D\nGaussian Splatting (3DGS) methods often focus on photometric consistency,\nneglecting geometric accuracy and failing to exploit SLAM's dynamic depth and\npose updates for scene refinement. We propose a framework integrating dense\nSLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach\nintroduces SLAM-Informed Adaptive Densification, which dynamically updates and\ndensifies the Gaussian model by leveraging dense point clouds from SLAM.\nAdditionally, we incorporate Geometry-Guided Optimization, which combines\nedge-aware geometric constraints and photometric consistency to jointly\noptimize the appearance and geometry of the 3DGS scene representation, enabling\ndetailed and accurate SLAM mapping reconstruction. Experiments on the Replica\nand TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving\nstate-of-the-art results among monocular systems. Specifically, our method\nachieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica,\nrepresenting improvements of 10.7%, 6.4%, and 49.4%, respectively, over the\nprevious SOTA. On TUM-RGBD, our method outperforms the closest baseline by\n10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the\npotential of our framework in bridging the gap between photometric and\ngeometric dense 3D scene representations, paving the way for practical and\nefficient monocular dense reconstruction.\n","authors":["Yue Hu","Rong Liu","Meida Chen","Andrew Feng","Peter Beerel"],"pdf_url":"https://arxiv.org/pdf/2501.07015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12185v4","updated":"2025-01-13T02:14:51Z","published":"2024-02-19T14:48:23Z","title":"ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for\n Complicated Chart Reasoning","summary":" Recently, many versatile Multi-modal Large Language Models (MLLMs) have\nemerged continuously. However, their capacity to query information depicted in\nvisual charts and engage in reasoning based on the queried contents remains\nunder-explored. In this paper, to comprehensively and rigorously benchmark the\nability of the off-the-shelf MLLMs in the chart domain, we construct ChartX, a\nmulti-modal evaluation set covering 18 chart types, 7 chart tasks, 22\ndisciplinary topics, and high-quality chart data. Besides, we develop ChartVLM\nto offer a new perspective on handling multi-modal tasks that strongly depend\non interpretable patterns, such as reasoning tasks in the field of charts or\ngeometric images. We evaluate the chart-related ability of mainstream MLLMs and\nour ChartVLM on the proposed ChartX evaluation set. Extensive experiments\ndemonstrate that ChartVLM surpasses both versatile and chart-related large\nmodels, achieving results comparable to GPT-4V. We believe that our study can\npave the way for further exploration in creating a more comprehensive chart\nevaluation set and developing more interpretable multi-modal models. 
Both\nChartX and ChartVLM are available at:\nhttps://github.com/Alpha-Innovator/ChartVLM\n","authors":["Renqiu Xia","Bo Zhang","Hancheng Ye","Xiangchao Yan","Qi Liu","Hongbin Zhou","Zijun Chen","Min Dou","Botian Shi","Junchi Yan","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2402.12185v4.pdf","comment":"Code and dataset are available for downloading at:\n https://github.com/Alpha-Innovator/ChartVLM 25 pages, 15 figures"},{"id":"http://arxiv.org/abs/2501.02763v2","updated":"2025-01-13T01:21:29Z","published":"2025-01-06T05:14:40Z","title":"LDMapNet-U: An End-to-End System for City-Scale Lane-Level Map Updating","summary":" An up-to-date city-scale lane-level map is an indispensable infrastructure\nand a key enabling technology for ensuring the safety and user experience of\nautonomous driving systems. In industrial scenarios, reliance on manual\nannotation for map updates creates a critical bottleneck. Lane-level updates\nrequire precise change information and must ensure consistency with adjacent\ndata while adhering to strict standards. Traditional methods utilize a\nthree-stage approach-construction, change detection, and updating-which often\nnecessitates manual verification due to accuracy limitations. This results in\nlabor-intensive processes and hampers timely updates. To address these\nchallenges, we propose LDMapNet-U, which implements a new end-to-end paradigm\nfor city-scale lane-level map updating. By reconceptualizing the update task as\nan end-to-end map generation process grounded in historical map data, we\nintroduce a paradigm shift in map updating that simultaneously generates\nvectorized maps and change information. To achieve this, a Prior-Map Encoding\n(PME) module is introduced to effectively encode historical maps, serving as a\ncritical reference for detecting changes. Additionally, we incorporate a novel\nInstance Change Prediction (ICP) module that learns to predict associations\nwith historical maps. Consequently, LDMapNet-U simultaneously achieves\nvectorized map element generation and change detection. To demonstrate the\nsuperiority and effectiveness of LDMapNet-U, extensive experiments are\nconducted using large-scale real-world datasets. In addition, LDMapNet-U has\nbeen successfully deployed in production at Baidu Maps since April 2024,\nsupporting map updating for over 360 cities and significantly shortening the\nupdate cycle from quarterly to weekly. The updated maps serve hundreds of\nmillions of users and are integrated into the autonomous driving systems of\nseveral leading vehicle companies.\n","authors":["Deguo Xia","Weiming Zhang","Xiyan Liu","Wei Zhang","Chenting Gong","Xiao Tan","Jizhou Huang","Mengmeng Yang","Diange Yang"],"pdf_url":"https://arxiv.org/pdf/2501.02763v2.pdf","comment":"Accepted by KDD 2025, camera-ready version"},{"id":"http://arxiv.org/abs/2501.06986v1","updated":"2025-01-13T00:29:55Z","published":"2025-01-13T00:29:55Z","title":"LEO: Boosting Mixture of Vision Encoders for Multimodal Large Language\n Models","summary":" Enhanced visual understanding serves as a cornerstone for multimodal large\nlanguage models (MLLMs). Recent hybrid MLLMs incorporate a mixture of vision\nexperts to address the limitations of using a single vision encoder and\nexcessively long visual tokens. Despite the progress of these MLLMs, a research\ngap remains in effectively integrating diverse vision encoders. 
This work\nexplores fusion strategies of visual tokens for hybrid MLLMs, leading to the\ndesign of LEO, a novel MLLM with a dual-branch vision encoder framework that\nincorporates a post-adaptation fusion strategy and adaptive tiling: for each\nsegmented tile of the input images, LEO sequentially interleaves the visual\ntokens from its two vision encoders. Extensive evaluation across 13\nvision-language benchmarks reveals that LEO outperforms state-of-the-art\nopen-source MLLMs and hybrid MLLMs on the majority of tasks. Furthermore, we\nshow that LEO can be adapted to the specialized domain of autonomous driving\nwithout altering the model architecture or training recipe, achieving\ncompetitive performance compared to existing baselines. The code and model will\nbe publicly available.\n","authors":["Mozhgan Nasr Azadani","James Riddell","Sean Sedwards","Krzysztof Czarnecki"],"pdf_url":"https://arxiv.org/pdf/2501.06986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07754v1","updated":"2025-01-13T23:55:11Z","published":"2025-01-13T23:55:11Z","title":"Universal Training of Neural Networks to Achieve Bayes Optimal\n Classification Accuracy","summary":" This work invokes the notion of $f$-divergence to introduce a novel upper\nbound on the Bayes error rate of a general classification task. We show that\nthe proposed bound can be computed by sampling from the output of a\nparameterized model. Using this practical interpretation, we introduce the\nBayes optimal learning threshold (BOLT) loss whose minimization enforces a\nclassification model to achieve the Bayes error rate. We validate the proposed\nloss for image and text classification tasks, considering MNIST, Fashion-MNIST,\nCIFAR-10, and IMDb datasets. Numerical experiments demonstrate that models\ntrained with BOLT achieve performance on par with or exceeding that of\ncross-entropy, particularly on challenging datasets. This highlights the\npotential of BOLT in improving generalization.\n","authors":["Mohammadreza Tavasoli Naeini","Ali Bereyhi","Morteza Noshad","Ben Liang","Alfred O. Hero III"],"pdf_url":"https://arxiv.org/pdf/2501.07754v1.pdf","comment":"Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2410.20631v2","updated":"2025-01-13T23:45:51Z","published":"2024-10-27T23:29:46Z","title":"PViT: Prior-augmented Vision Transformer for Out-of-distribution\n Detection","summary":" Vision Transformers (ViTs) have achieved remarkable success over various\nvision tasks, yet their robustness against data distribution shifts and\ninherent inductive biases remain underexplored. To enhance the robustness of\nViT models for image Out-of-Distribution (OOD) detection, we introduce a novel\nand generic framework named Prior-augmented Vision Transformer (PViT). Taking\nas input the prior class logits from a pretrained model, we train PViT to\npredict the class logits. During inference, PViT identifies OOD samples by\nquantifying the divergence between the predicted class logits and the prior\nlogits obtained from pre-trained models. Unlike existing state-of-the-art(SOTA)\nOOD detection methods, PViT shapes the decision boundary between ID and OOD by\nutilizing the proposed prior guided confidence, without requiring additional\ndata modeling, generation methods, or structural modifications. Extensive\nexperiments on the large-scale ImageNet benchmark, evaluated against over seven\nOOD datasets, demonstrate that PViT significantly outperforms existing SOTA OOD\ndetection methods in terms of FPR95 and AUROC. 
The codebase is publicly\navailable at https://github.com/RanchoGoose/PViT.\n","authors":["Tianhao Zhang","Zhixiang Chen","Lyudmila S. Mihaylova"],"pdf_url":"https://arxiv.org/pdf/2410.20631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00142v2","updated":"2025-01-13T23:45:26Z","published":"2024-11-28T18:55:41Z","title":"Sparse Attention Vectors: Generative Multimodal Model Features Are\n Discriminative Vision-Language Classifiers","summary":" Generative Large Multimodal Models (LMMs) like LLaVA and Qwen-VL excel at a\nwide variety of vision-language (VL) tasks such as image captioning or visual\nquestion answering. Despite strong performance, LMMs are not directly suited\nfor foundational discriminative vision-language tasks (i.e., tasks requiring\ndiscrete label predictions) such as image classification and multiple-choice\nVQA. One key challenge in utilizing LMMs for discriminative tasks is the\nextraction of useful features from generative models. To overcome this issue,\nwe propose an approach for finding features in the model's latent space to more\neffectively leverage LMMs for discriminative tasks. Toward this end, we present\nSparse Attention Vectors (SAVs) -- a finetuning-free method that leverages\nsparse attention head activations (fewer than 1\\% of the heads) in LMMs as\nstrong features for VL tasks. With only few-shot examples, SAVs demonstrate\nstate-of-the-art performance compared to a variety of few-shot and finetuned\nbaselines on a collection of discriminative tasks. Our experiments also imply\nthat SAVs can scale in performance with additional examples and generalize to\nsimilar tasks, establishing SAVs as both effective and robust multimodal\nfeature representations.\n","authors":["Chancharik Mitra","Brandon Huang","Tianning Chai","Zhiqiu Lin","Assaf Arbelle","Rogerio Feris","Leonid Karlinsky","Trevor Darrell","Deva Ramanan","Roei Herzig"],"pdf_url":"https://arxiv.org/pdf/2412.00142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07750v1","updated":"2025-01-13T23:38:49Z","published":"2025-01-13T23:38:49Z","title":"Boosting Sclera Segmentation through Semi-supervised Learning with Fewer\n Labels","summary":" Sclera segmentation is crucial for developing automatic eye-related medical\ncomputer-aided diagnostic systems, as well as for personal identification and\nverification, because the sclera contains distinct personal features. Deep\nlearning-based sclera segmentation has achieved significant success compared to\ntraditional methods that rely on hand-crafted features, primarily because it\ncan autonomously extract critical output-related features without the need to\nconsider potential physical constraints. However, achieving accurate sclera\nsegmentation using these methods is challenging due to the scarcity of\nhigh-quality, fully labeled datasets, which depend on costly, labor-intensive\nmedical acquisition and expertise. To address this challenge, this paper\nintroduces a novel sclera segmentation framework that excels with limited\nlabeled samples. Specifically, we employ a semi-supervised learning method that\nintegrates domain-specific improvements and image-based spatial transformations\nto enhance segmentation performance. Additionally, we have developed a\nreal-world eye diagnosis dataset to enrich the evaluation process. 
Extensive\nexperiments on our dataset and two additional public datasets demonstrate the\neffectiveness and superiority of our proposed method, especially with\nsignificantly fewer labeled samples.\n","authors":["Guanjun Wang","Lu Wang","Ning Niu","Qiaoyi Yao","Yixuan Wang","Sufen Ren","Shengchao Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07750v1.pdf","comment":"Under review, 19 pages, 9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.07746v1","updated":"2025-01-13T23:21:33Z","published":"2025-01-13T23:21:33Z","title":"A Heterogeneous Multimodal Graph Learning Framework for Recognizing User\n Emotions in Social Networks","summary":" The rapid expansion of social media platforms has provided unprecedented\naccess to massive amounts of multimodal user-generated content. Comprehending\nuser emotions can provide valuable insights for improving communication and\nunderstanding of human behaviors. Despite significant advancements in Affective\nComputing, the diverse factors influencing user emotions in social networks\nremain relatively understudied. Moreover, there is a notable lack of deep\nlearning-based methods for predicting user emotions in social networks, which\ncould be addressed by leveraging the extensive multimodal data available. This\nwork presents a novel formulation of personalized emotion prediction in social\nnetworks based on heterogeneous graph learning. Building upon this formulation,\nwe design HMG-Emo, a Heterogeneous Multimodal Graph Learning Framework that\nutilizes deep learning-based features for user emotion recognition.\nAdditionally, we include a dynamic context fusion module in HMG-Emo that is\ncapable of adaptively integrating the different modalities in social media\ndata. Through extensive experiments, we demonstrate the effectiveness of\nHMG-Emo and verify the superiority of adopting a graph neural network-based\napproach, which outperforms existing baselines that use rich hand-crafted\nfeatures. To the best of our knowledge, HMG-Emo is the first multimodal and\ndeep-learning-based approach to predict personalized emotions within online\nsocial networks. Our work highlights the significance of exploiting advanced\ndeep learning techniques for less-explored problems in Affective Computing.\n","authors":["Sree Bhattacharyya","Shuhua Yang","James Z. Wang"],"pdf_url":"https://arxiv.org/pdf/2501.07746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07742v1","updated":"2025-01-13T23:13:33Z","published":"2025-01-13T23:13:33Z","title":"Fixing the Scale and Shift in Monocular Depth For Camera Pose Estimation","summary":" Recent advances in monocular depth prediction have led to significantly\nimproved depth prediction accuracy. In turn, this enables various applications\nto use such depth predictions. In this paper, we propose a novel framework for\nestimating the relative pose between two cameras from point correspondences\nwith associated monocular depths. Since depth predictions are typically defined\nup to an unknown scale and shift parameter, our solvers jointly estimate both\nscale and shift parameters together with the camera pose. We derive efficient\nsolvers for three cases: (1) two calibrated cameras, (2) two uncalibrated\ncameras with an unknown but shared focal length, and (3) two uncalibrated\ncameras with unknown and different focal lengths. Experiments on synthetic and\nreal data, including experiments with depth maps estimated by 11 different\ndepth predictors, show the practical viability of our solvers. 
Compared to\nprior work, our solvers achieve state-of-the-art results on two large-scale,\nreal-world datasets. The source code is available at\nhttps://github.com/yaqding/pose_monodepth\n","authors":["Yaqing Ding","Václav Vávra","Viktor Kocur","Jian Yang","Torsten Sattler","Zuzana Kukelova"],"pdf_url":"https://arxiv.org/pdf/2501.07742v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2501.07730v1","updated":"2025-01-13T22:37:17Z","published":"2025-01-13T22:37:17Z","title":"Democratizing Text-to-Image Masked Generative Models with Compact\n Text-Aware One-Dimensional Tokens","summary":" Image tokenizers form the foundation of modern text-to-image generative\nmodels but are notoriously difficult to train. Furthermore, most existing\ntext-to-image models rely on large-scale, high-quality private datasets, making\nthem challenging to replicate. In this work, we introduce Text-Aware\nTransformer-based 1-Dimensional Tokenizer (TA-TiTok), an efficient and powerful\nimage tokenizer that can utilize either discrete or continuous 1-dimensional\ntokens. TA-TiTok uniquely integrates textual information during the tokenizer\ndecoding stage (i.e., de-tokenization), accelerating convergence and enhancing\nperformance. TA-TiTok also benefits from a simplified, yet effective, one-stage\ntraining process, eliminating the need for the complex two-stage distillation\nused in previous 1-dimensional tokenizers. This design allows for seamless\nscalability to large datasets. Building on this, we introduce a family of\ntext-to-image Masked Generative Models (MaskGen), trained exclusively on open\ndata while achieving comparable performance to models trained on private data.\nWe aim to release both the efficient, strong TA-TiTok tokenizers and the\nopen-data, open-weight MaskGen models to promote broader access and democratize\nthe field of text-to-image masked generative models.\n","authors":["Dongwon Kim","Ju He","Qihang Yu","Chenglin Yang","Xiaohui Shen","Suha Kwak","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07730v1.pdf","comment":"Project page at https://tacju.github.io/projects/maskgen.html"},{"id":"http://arxiv.org/abs/2404.12652v2","updated":"2025-01-13T21:59:56Z","published":"2024-04-19T06:41:32Z","title":"Pre-trained Vision-Language Models Learn Discoverable Visual Concepts","summary":" Do vision-language models (VLMs) pre-trained to caption an image of a\n\"durian\" learn visual concepts such as \"brown\" (color) and \"spiky\" (texture) at\nthe same time? We aim to answer this question as visual concepts learned \"for\nfree\" would enable wide applications such as neuro-symbolic reasoning or\nhuman-interpretable object classification. We assume that the visual concepts,\nif captured by pre-trained VLMs, can be extracted by their vision-language\ninterface with text-based concept prompts. We observe that recent works\nprompting VLMs with concepts often differ in their strategies to define and\nevaluate the visual concepts, leading to conflicting conclusions. We propose a\nnew concept definition strategy based on two observations: First, certain\nconcept prompts include shortcuts that recognize correct concepts for wrong\nreasons; Second, multimodal information (e.g. visual discriminativeness, and\ntextual knowledge) should be leveraged when selecting the concepts. Our\nproposed concept discovery and learning (CDL) framework is thus designed to\nidentify a diverse list of generic visual concepts (e.g. 
\"spiky\" as opposed to\n\"spiky durian\"), which are ranked and selected based on visual and language\nmutual information. We carefully design quantitative and human evaluations of\nthe discovered concepts on six diverse visual recognition datasets, which\nconfirm that pre-trained VLMs do learn visual concepts that provide accurate\nand thorough descriptions for the recognized objects. All code and models are\npublicly released.\n","authors":["Yuan Zang","Tian Yun","Hao Tan","Trung Bui","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.12652v2.pdf","comment":"Transactions on Machine Learning Research, 2025"},{"id":"http://arxiv.org/abs/2501.07713v1","updated":"2025-01-13T21:52:46Z","published":"2025-01-13T21:52:46Z","title":"Testing Human-Hand Segmentation on In-Distribution and\n Out-of-Distribution Data in Human-Robot Interactions Using a Deep Ensemble\n Model","summary":" Reliable detection and segmentation of human hands are critical for enhancing\nsafety and facilitating advanced interactions in human-robot collaboration.\nCurrent research predominantly evaluates hand segmentation under\nin-distribution (ID) data, which reflects the training data of deep learning\n(DL) models. However, this approach fails to address out-of-distribution (OOD)\nscenarios that often arise in real-world human-robot interactions. In this\nstudy, we present a novel approach by evaluating the performance of pre-trained\nDL models under both ID data and more challenging OOD scenarios. To mimic\nrealistic industrial scenarios, we designed a diverse dataset featuring simple\nand cluttered backgrounds with industrial tools, varying numbers of hands (0 to\n4), and hands with and without gloves. For OOD scenarios, we incorporated\nunique and rare conditions such as finger-crossing gestures and motion blur\nfrom fast-moving hands, addressing both epistemic and aleatoric uncertainties.\nTo ensure multiple point of views (PoVs), we utilized both egocentric cameras,\nmounted on the operator's head, and static cameras to capture RGB images of\nhuman-robot interactions. This approach allowed us to account for multiple\ncamera perspectives while also evaluating the performance of models trained on\nexisting egocentric datasets as well as static-camera datasets. For\nsegmentation, we used a deep ensemble model composed of UNet and RefineNet as\nbase learners. Performance evaluation was conducted using segmentation metrics\nand uncertainty quantification via predictive entropy. Results revealed that\nmodels trained on industrial datasets outperformed those trained on\nnon-industrial datasets, highlighting the importance of context-specific\ntraining. Although all models struggled with OOD scenarios, those trained on\nindustrial datasets demonstrated significantly better generalization.\n","authors":["Reza Jalayer","Yuxin Chen","Masoud Jalayer","Carlotta Orsenigo","Masayoshi Tomizuka"],"pdf_url":"https://arxiv.org/pdf/2501.07713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07711v1","updated":"2025-01-13T21:45:01Z","published":"2025-01-13T21:45:01Z","title":"Pedestrian Trajectory Prediction Based on Social Interactions Learning\n With Random Weights","summary":" Pedestrian trajectory prediction is a critical technology in the evolution of\nself-driving cars toward complete artificial intelligence. 
Over recent years,\nfocusing on the trajectories of pedestrians to model their social interactions\nhas surged with great interest in more accurate trajectory predictions.\nHowever, existing methods for modeling pedestrian social interactions rely on\npre-defined rules, struggling to capture non-explicit social interactions. In\nthis work, we propose a novel framework named DTGAN, which extends the\napplication of Generative Adversarial Networks (GANs) to graph sequence data,\nwith the primary objective of automatically capturing implicit social\ninteractions and achieving precise predictions of pedestrian trajectory. DTGAN\ninnovatively incorporates random weights within each graph to eliminate the\nneed for pre-defined interaction rules. We further enhance the performance of\nDTGAN by exploring diverse task loss functions during adversarial training,\nwhich yields improvements of 16.7\\% and 39.3\\% on metrics ADE and FDE,\nrespectively. The effectiveness and accuracy of our framework are verified on\ntwo public datasets. The experimental results show that our proposed DTGAN\nachieves superior performance and is well able to understand pedestrians'\nintentions.\n","authors":["Jiajia Xie","Sheng Zhang","Beihao Xia","Zhu Xiao","Hongbo Jiang","Siwang Zhou","Zheng Qin","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2501.07711v1.pdf","comment":"13 pages,7 figures,Accepted to IEEE Transactions on Multimedia (TMM)"},{"id":"http://arxiv.org/abs/2404.00427v2","updated":"2025-01-13T21:20:02Z","published":"2024-03-30T17:21:07Z","title":"Extracting Manifold Information from Point Clouds","summary":" A kernel based method is proposed for the construction of signature\n(defining) functions of subsets of $\\mathbb{R}^d$. The subsets can range from\nfull dimensional manifolds (open subsets) to point clouds (a finite number of\npoints) and include bounded smooth manifolds of any codimension. The\ninterpolation and analysis of point clouds are the main application. Two\nextreme cases in terms of regularity are considered, where the data set is\ninterpolated by an analytic surface, at the one extreme, and by a H\\\"older\ncontinuous surface, at the other. The signature function can be computed as a\nlinear combination of translated kernels, the coefficients of which are the\nsolution of a finite dimensional linear problem. Once it is obtained, it can be\nused to estimate the dimension as well as the normal and the curvatures of the\ninterpolated surface. The method is global and does not require explicit\nknowledge of local neighborhoods or any other structure present in the data\nset. It admits a variational formulation with a natural ``regularized''\ncounterpart, that proves to be useful in dealing with data sets corrupted by\nnumerical error or noise. The underlying analytical structure of the approach\nis presented in general before it is applied to the case of point clouds.\n","authors":["Patrick Guidotti"],"pdf_url":"https://arxiv.org/pdf/2404.00427v2.pdf","comment":"27 pages, 16 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.07688v1","updated":"2025-01-13T21:04:37Z","published":"2025-01-13T21:04:37Z","title":"C2PD: Continuity-Constrained Pixelwise Deformation for Guided Depth\n Super-Resolution","summary":" Guided depth super-resolution (GDSR) has demonstrated impressive performance\nacross a wide range of domains, with numerous methods being proposed. 
However,\nexisting methods often treat depth maps as images, where shading values are\ncomputed discretely, making them struggle to effectively restore the continuity\ninherent in the depth map. In this paper, we propose a novel approach that\nmaximizes the utilization of spatial characteristics in depth, coupled with\nhuman abstract perception of real-world substance, by transforming the GDSR\nissue into deformation of a roughcast with ideal plasticity, which can be\ndeformed by force like a continuous object. Specifically, we firstly designed a\ncross-modal operation, Continuity-constrained Asymmetrical Pixelwise Operation\n(CAPO), which can mimic the process of deforming an isovolumetrically flexible\nobject through external forces. Utilizing CAPO as the fundamental component, we\ndevelop the Pixelwise Cross Gradient Deformation (PCGD), which is capable of\nemulating operations on ideal plastic objects (without volume constraint).\nNotably, our approach demonstrates state-of-the-art performance across four\nwidely adopted benchmarks for GDSR, with significant advantages in large-scale\ntasks and generalizability.\n","authors":["Jiahui Kang","Qing Cai","Runqing Tan","Yimei Liu","Zhi Liu"],"pdf_url":"https://arxiv.org/pdf/2501.07688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07681v1","updated":"2025-01-13T20:41:52Z","published":"2025-01-13T20:41:52Z","title":"Dataset Distillation as Pushforward Optimal Quantization","summary":" Dataset distillation aims to find a synthetic training set such that training\non the synthetic data achieves similar performance to training on real data,\nwith orders of magnitude less computational requirements. Existing methods can\nbe broadly categorized as either bi-level optimization problems that have\nneural network training heuristics as the lower level problem, or disentangled\nmethods that bypass the bi-level optimization by matching distributions of\ndata. The latter method has the major advantages of speed and scalability in\nterms of size of both training and distilled datasets. We demonstrate that when\nequipped with an encoder-decoder structure, the empirically successful\ndisentangled methods can be reformulated as an optimal quantization problem,\nwhere a finite set of points is found to approximate the underlying probability\nmeasure by minimizing the expected projection distance. In particular, we link\nexisting disentangled dataset distillation methods to the classical optimal\nquantization and Wasserstein barycenter problems, demonstrating consistency of\ndistilled datasets for diffusion-based generative priors. We propose a simple\nextension of the state-of-the-art data distillation method D4M, achieving\nbetter performance on the ImageNet-1K dataset with trivial additional\ncomputation, and state-of-the-art performance in higher image-per-class\nsettings.\n","authors":["Hong Ye Tan","Emma Slade"],"pdf_url":"https://arxiv.org/pdf/2501.07681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02052v4","updated":"2025-01-13T19:51:53Z","published":"2024-10-02T21:42:35Z","title":"ExACT: Teaching AI Agents to Explore with Reflective-MCTS and\n Exploratory Learning","summary":" Autonomous agents have demonstrated significant potential in automating\ncomplex multistep decision-making tasks. 
However, even state-of-the-art\nvision-language models (VLMs), such as GPT-4o, still fall short of human-level\nperformance, particularly in intricate web environments and long-horizon tasks.\nTo address these limitations, we present ExACT, an approach to combine\ntest-time search and self-learning to build o1-like models for agentic\napplications. We first introduce Reflective Monte Carlo Tree Search (R-MCTS), a\nnovel test time algorithm designed to enhance AI agents' ability to explore\ndecision space on the fly. R-MCTS extends traditional MCTS by 1) incorporating\ncontrastive reflection, allowing agents to learn from past interactions and\ndynamically improve their search efficiency; and 2) using multi-agent debate\nfor reliable state evaluation. Next, we introduce Exploratory Learning, a novel\nlearning strategy to teach agents to search at inference time without relying\non any external search algorithms. On the challenging VisualWebArena benchmark,\nour GPT-4o based R-MCTS agent achieves a 6% to 30% relative improvement across\nvarious tasks compared to the previous state-of-the-art. Additionally, we show\nthat the knowledge and experience gained from test-time search can be\neffectively transferred back to GPT-4o via fine-tuning. After Exploratory\nLearning, GPT-4o 1) demonstrates the ability to explore the environment,\nevaluate a state, and backtrack to viable ones when it detects that the current\nstate cannot lead to success, and 2) matches 87% of R-MCTS's performance while\nusing significantly less compute. Notably, our work demonstrates the compute\nscaling properties in both training - data collection with R-MCTS - and testing\ntime. These results suggest a promising research direction to enhance VLMs'\ncapabilities for agentic applications via test-time search and self-learning.\n","authors":["Xiao Yu","Baolin Peng","Vineeth Vajipey","Hao Cheng","Michel Galley","Jianfeng Gao","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02052v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07647v1","updated":"2025-01-13T19:17:06Z","published":"2025-01-13T19:17:06Z","title":"BlobGEN-Vid: Compositional Text-to-Video Generation with Blob Video\n Representations","summary":" Existing video generation models struggle to follow complex text prompts and\nsynthesize multiple objects, raising the need for additional grounding input\nfor improved controllability. In this work, we propose to decompose videos into\nvisual primitives - blob video representation, a general representation for\ncontrollable video generation. Based on blob conditions, we develop a\nblob-grounded video diffusion model named BlobGEN-Vid that allows users to\ncontrol object motions and fine-grained object appearance. In particular, we\nintroduce a masked 3D attention module that effectively improves regional\nconsistency across frames. In addition, we introduce a learnable module to\ninterpolate text embeddings so that users can control semantics in specific\nframes and obtain smooth object transitions. We show that our framework is\nmodel-agnostic and build BlobGEN-Vid based on both U-Net and DiT-based video\ndiffusion models. Extensive experimental results show that BlobGEN-Vid achieves\nsuperior zero-shot video generation ability and state-of-the-art layout\ncontrollability on multiple benchmarks. 
When combined with an LLM for layout\nplanning, our framework even outperforms proprietary text-to-video generators\nin terms of compositional accuracy.\n","authors":["Weixi Feng","Chao Liu","Sifei Liu","William Yang Wang","Arash Vahdat","Weili Nie"],"pdf_url":"https://arxiv.org/pdf/2501.07647v1.pdf","comment":"Project page: https://blobgen-vid2.github.io/"}]},"2025-01-12T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2412.20758v2","updated":"2025-01-12T21:57:58Z","published":"2024-12-30T07:03:54Z","title":"High-Sensitivity Vision-Based Tactile Sensing Enhanced by\n Microstructures and Lightweight CNN","summary":" Tactile sensing is critical in advanced interactive systems by emulating the\nhuman sense of touch to detect stimuli. Vision-based tactile sensors (VBTSs)\nare promising for their ability to provide rich information, robustness,\nadaptability, low cost, and multimodal capabilities. However, current\ntechnologies still have limitations in sensitivity, spatial resolution, and the\nhigh computational demands of deep learning-based image processing. This paper\npresents a comprehensive approach combining a novel sensor structure with\nmicromachined structures and an efficient image processing method, and\ndemonstrates that carefully engineered microstructures within the sensor\nhardware can significantly enhance sensitivity while reducing computational\nload. Unlike traditional designs with tracking markers, our sensor incorporates\nan interface surface with micromachined trenches, as an example of\nmicrostructures, which modulate light transmission and amplify the variation in\nresponse to applied force. By capturing variations in brightness, wire width,\nand cross pattern locations with a camera, the sensor accurately infers the\ncontact location, the magnitude of displacement and applied force with a\nlightweight convolutional neural network (CNN). Theoretical and experimental\nresults demonstrated that the microstructures significantly enhance sensitivity\nby amplifying the visual effects of shape distortion. The sensor system\neffectively detected forces below 10 mN, and achieved a millimetre-level\nsingle-point spatial resolution. Using a model with only one convolutional\nlayer, a mean absolute error (MAE) below 0.05 mm have been achieved. Its soft\nsensor body ensures compatibility with soft robots and wearable electronics,\nwhile its immunity to electrical crosstalk and interference guarantees\nreliability in complex human-machine environments.\n","authors":["Mayue Shi","Yongqi Zhang","Xiaotong Guo","Eric M. Yeatman"],"pdf_url":"https://arxiv.org/pdf/2412.20758v2.pdf","comment":"27 pages, 13 figures, 2 tables; rearranged figures; corrected typos"},{"id":"http://arxiv.org/abs/2501.06946v1","updated":"2025-01-12T21:46:57Z","published":"2025-01-12T21:46:57Z","title":"Learning Implicit Social Navigation Behavior using Deep Inverse\n Reinforcement Learning","summary":" This paper reports on learning a reward map for social navigation in dynamic\nenvironments where the robot can reason about its path at any time, given\nagents' trajectories and scene geometry. Humans navigating in dense and dynamic\nindoor environments often work with several implied social rules. A rule-based\napproach fails to model all possible interactions between humans, robots, and\nscenes. We propose a novel Smooth Maximum Entropy Deep Inverse Reinforcement\nLearning (S-MEDIRL) algorithm that can extrapolate beyond expert demos to\nbetter encode scene navigability from few-shot demonstrations. 
The agent learns\nto predict the cost maps reasoning on trajectory data and scene geometry. The\nagent samples a trajectory that is then executed using a local crowd navigation\ncontroller. We present results in a photo-realistic simulation environment,\nwith a robot and a human navigating a narrow crossing scenario. The robot\nimplicitly learns to exhibit social behaviors such as yielding to oncoming\ntraffic and avoiding deadlocks. We compare the proposed approach to the popular\nmodel-based crowd navigation algorithm ORCA and a rule-based agent that\nexhibits yielding.\n","authors":["Tribhi Kathuria","Ke Liu","Junwoo Jang","X. Jessie Yang","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2501.06946v1.pdf","comment":"8 pages, Submitted to IEEE Robotics and Automation Letters (RAL)"},{"id":"http://arxiv.org/abs/2408.14518v2","updated":"2025-01-12T21:01:00Z","published":"2024-08-26T00:13:14Z","title":"A Survey on Reinforcement Learning Applications in SLAM","summary":" The emergence of mobile robotics, particularly in the automotive industry,\nintroduces a promising era of enriched user experiences and adept handling of\ncomplex navigation challenges. The realization of these advancements\nnecessitates a focused technological effort and the successful execution of\nnumerous intricate tasks, particularly in the critical domain of Simultaneous\nLocalization and Mapping (SLAM). Various artificial intelligence (AI)\nmethodologies, such as deep learning and reinforcement learning, present viable\nsolutions to address the challenges in SLAM. This study specifically explores\nthe application of reinforcement learning in the context of SLAM. By enabling\nthe agent (the robot) to iteratively interact with and receive feedback from\nits environment, reinforcement learning facilitates the acquisition of\nnavigation and mapping skills, thereby enhancing the robot's decision-making\ncapabilities. This approach offers several advantages, including improved\nnavigation proficiency, increased resilience, reduced dependence on sensor\nprecision, and refinement of the decision-making process. The findings of this\nstudy, which provide an overview of reinforcement learning's utilization in\nSLAM, reveal significant advancements in the field. The investigation also\nhighlights the evolution and innovative integration of these techniques.\n","authors":["Mohammad Dehghani Tezerjani","Mohammad Khoshnazar","Mohammadhamed Tangestanizadeh","Arman Kiani","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2408.14518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06919v1","updated":"2025-01-12T20:07:22Z","published":"2025-01-12T20:07:22Z","title":"Shake-VLA: Vision-Language-Action Model-Based System for Bimanual\n Robotic Manipulations and Liquid Mixing","summary":" This paper introduces Shake-VLA, a Vision-Language-Action (VLA) model-based\nsystem designed to enable bimanual robotic manipulation for automated cocktail\npreparation. The system integrates a vision module for detecting ingredient\nbottles and reading labels, a speech-to-text module for interpreting user\ncommands, and a language model to generate task-specific robotic instructions.\nForce Torque (FT) sensors are employed to precisely measure the quantity of\nliquid poured, ensuring accuracy in ingredient proportions during the mixing\nprocess. 
The system architecture includes a Retrieval-Augmented Generation\n(RAG) module for accessing and adapting recipes, an anomaly detection mechanism\nto address ingredient availability issues, and bimanual robotic arms for\ndexterous manipulation. Experimental evaluations demonstrated a high success\nrate across system components, with the speech-to-text module achieving a 93%\nsuccess rate in noisy environments, the vision module attaining a 91% success\nrate in object and label detection in cluttered environment, the anomaly module\nsuccessfully identified 95% of discrepancies between detected ingredients and\nrecipe requirements, and the system achieved an overall success rate of 100% in\npreparing cocktails, from recipe formulation to action generation.\n","authors":["Muhamamd Haris Khan","Selamawit Asfaw","Dmitrii Iarchuk","Miguel Altamirano Cabrera","Luis Moreno","Issatay Tokmurziyev","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.06919v1.pdf","comment":"Accepted to IEEE/ACM HRI 2025"},{"id":"http://arxiv.org/abs/2501.04693v2","updated":"2025-01-12T20:02:27Z","published":"2025-01-08T18:57:33Z","title":"Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous\n Sensors via Language Grounding","summary":" Interacting with the world is a multi-sensory experience: achieving effective\ngeneral-purpose interaction requires making use of all available modalities --\nincluding vision, touch, and audio -- to fill in gaps from partial observation.\nFor example, when vision is occluded reaching into a bag, a robot should rely\non its senses of touch and sound. However, state-of-the-art generalist robot\npolicies are typically trained on large datasets to predict robot actions\nsolely from visual and proprioceptive observations. In this work, we propose\nFuSe, a novel approach that enables finetuning visuomotor generalist policies\non heterogeneous sensor modalities for which large datasets are not readily\navailable by leveraging natural language as a common cross-modal grounding. We\ncombine a multimodal contrastive loss with a sensory-grounded language\ngeneration loss to encode high-level semantics. In the context of robot\nmanipulation, we show that FuSe enables performing challenging tasks that\nrequire reasoning jointly over modalities such as vision, touch, and sound in a\nzero-shot setting, such as multimodal prompting, compositional cross-modal\nprompting, and descriptions of objects it interacts with. We show that the same\nrecipe is applicable to widely different generalist policies, including both\ndiffusion-based generalist policies and large vision-language-action (VLA)\nmodels. Extensive experiments in the real world show that FuSe is able to\nincrease success rates by over 20% compared to all considered baselines.\n","authors":["Joshua Jones","Oier Mees","Carmelo Sferrazza","Kyle Stachowicz","Pieter Abbeel","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2501.04693v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06904v1","updated":"2025-01-12T19:05:44Z","published":"2025-01-12T19:05:44Z","title":"From Simulation to Field: Learning Terrain Traversability for Real-World\n Deployment","summary":" The challenge of traversability estimation is a crucial aspect of autonomous\nnavigation in unstructured outdoor environments such as forests. 
It involves\ndetermining whether certain areas are passable or risky for robots, taking into\naccount factors like terrain irregularities, slopes, and potential obstacles.\nThe majority of current methods for traversability estimation operate on the\nassumption of an offline computation, overlooking the significant influence of\nthe robot's heading direction on accurate traversability estimates. In this\nwork, we introduce a deep neural network that uses detailed geometric\nenvironmental data together with the robot's recent movement characteristics.\nThis fusion enables the generation of robot direction awareness and continuous\ntraversability estimates, essential for enhancing robot autonomy in challenging\nterrains like dense forests. The efficacy and significance of our approach are\nunderscored by experiments conducted on both simulated and real robotic\nplatforms in various environments, yielding quantitatively superior performance\nresults compared to existing methods. Moreover, we demonstrate that our method,\ntrained exclusively in a high-fidelity simulated setting, can accurately\npredict traversability in real-world applications without any real data\ncollection. Our experiments showcase the advantages of our method for\noptimizing path-planning and exploration tasks within difficult outdoor\nenvironments, underscoring its practicality for effective, real-world robotic\nnavigation. In the spirit of collaborative advancement, we have made the code\nimplementation available to the public.\n","authors":["Fetullah Atas","Grzegorz Cielniak","Lars Grimstad"],"pdf_url":"https://arxiv.org/pdf/2501.06904v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2501.06897v1","updated":"2025-01-12T18:38:51Z","published":"2025-01-12T18:38:51Z","title":"ActiveGAMER: Active GAussian Mapping through Efficient Rendering","summary":" We introduce ActiveGAMER, an active mapping system that utilizes 3D Gaussian\nSplatting (3DGS) to achieve high-quality, real-time scene mapping and\nexploration. Unlike traditional NeRF-based methods, which are computationally\ndemanding and restrict active mapping performance, our approach leverages the\nefficient rendering capabilities of 3DGS, allowing effective and efficient\nexploration in complex environments. The core of our system is a\nrendering-based information gain module that dynamically identifies the most\ninformative viewpoints for next-best-view planning, enhancing both geometric\nand photometric reconstruction accuracy. ActiveGAMER also integrates a\ncarefully balanced framework, combining coarse-to-fine exploration,\npost-refinement, and a global-local keyframe selection strategy to maximize\nreconstruction completeness and fidelity. Our system autonomously explores and\nreconstructs environments with state-of-the-art geometric and photometric\naccuracy and completeness, significantly surpassing existing approaches in both\naspects. 
Extensive evaluations on benchmark datasets such as Replica and MP3D\nhighlight ActiveGAMER's effectiveness in active mapping tasks.\n","authors":["Liyan Chen","Huangying Zhan","Kevin Chen","Xiangyu Xu","Qingan Yan","Changjiang Cai","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2501.06897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06867v1","updated":"2025-01-12T16:31:53Z","published":"2025-01-12T16:31:53Z","title":"Toward a Universal Concept of Artificial Personality: Implementing\n Robotic Personality in a Kinova Arm","summary":" The fundamental role of personality in shaping interactions is increasingly\nbeing exploited in robotics. A carefully designed robotic personality has been\nshown to improve several key aspects of Human-Robot Interaction (HRI). However,\nthe fragmentation and rigidity of existing approaches reveal even greater\nchallenges when applied to non-humanoid robots. On one hand, the state of the\nart is very dispersed; on the other hand, Industry 4.0 is moving towards a\nfuture where humans and industrial robots are going to coexist. In this\ncontext, the proper design of a robotic personality can lead to more successful\ninteractions. This research takes a first step in that direction by integrating\na comprehensive cognitive architecture built upon the definition of robotic\npersonality - validated on humanoid robots - into a robotic Kinova Jaco2 arm.\nThe robot personality is defined through the cognitive architecture as a vector\nin the three-dimensional space encompassing Conscientiousness, Extroversion,\nand Agreeableness, affecting how actions are executed, the action selection\nprocess, and the internal reaction to environmental stimuli. Our main objective\nis to determine whether users perceive distinct personalities in the robot,\nregardless of its shape, and to understand the role language plays in shaping\nthese perceptions. To achieve this, we conducted a user study comprising 144\nsessions of a collaborative game between a Kinova Jaco2 arm and participants,\nwhere the robot's behavior was influenced by its assigned personality.\nFurthermore, we compared two conditions: in the first, the robot communicated\nsolely through gestures and action choices, while in the second, it also\nutilized verbal interaction.\n","authors":["Alice Nardelli","Lorenzo Landolfi","Dario Pasquali","Antonio Sgorbissa","Francesco Rea","Carmine Recchiuto"],"pdf_url":"https://arxiv.org/pdf/2501.06867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06847v1","updated":"2025-01-12T15:34:20Z","published":"2025-01-12T15:34:20Z","title":"Accelerating Discovery in Natural Science Laboratories with AI and\n Robotics: Perspectives and Challenges from the 2024 IEEE ICRA Workshop,\n Yokohama, Japan","summary":" Science laboratory automation enables accelerated discovery in life sciences\nand materials. However, it requires interdisciplinary collaboration to address\nchallenges such as robust and flexible autonomy, reproducibility, throughput,\nstandardization, the role of human scientists, and ethics. This article\nhighlights these issues, reflecting perspectives from leading experts in\nlaboratory automation across different disciplines of the natural sciences.\n","authors":["Andrew I. 
Cooper","Patrick Courtney","Kourosh Darvish","Moritz Eckhoff","Hatem Fakhruldeen","Andrea Gabrielli","Animesh Garg","Sami Haddadin","Kanako Harada","Jason Hein","Maria Hübner","Dennis Knobbe","Gabriella Pizzuto","Florian Shkurti","Ruja Shrestha","Kerstin Thurow","Rafael Vescovi","Birgit Vogel-Heuser","Ádám Wolf","Naruki Yoshikawa","Yan Zeng","Zhengxue Zhou","Henning Zwirnmann"],"pdf_url":"https://arxiv.org/pdf/2501.06847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06806v1","updated":"2025-01-12T13:13:04Z","published":"2025-01-12T13:13:04Z","title":"Soft Vision-Based Tactile-Enabled SixthFinger: Advancing Daily Objects\n Manipulation for Stroke Survivors","summary":" The presence of post-stroke grasping deficiencies highlights the critical\nneed for the development and implementation of advanced compensatory\nstrategies. This paper introduces a novel system to aid chronic stroke\nsurvivors through the development of a soft, vision-based, tactile-enabled\nextra robotic finger. By incorporating vision-based tactile sensing, the system\nautonomously adjusts grip force in response to slippage detection. This synergy\nnot only ensures mechanical stability but also enriches tactile feedback,\nmimicking the dynamics of human-object interactions. At the core of our\napproach is a transformer-based framework trained on a comprehensive tactile\ndataset encompassing objects with a wide range of morphological properties,\nincluding variations in shape, size, weight, texture, and hardness.\nFurthermore, we validated the system's robustness in real-world applications,\nwhere it successfully manipulated various everyday objects. The promising\nresults highlight the potential of this approach to improve the quality of life\nfor stroke survivors.\n","authors":["Basma Hasanen","Mashood M. Mohsan","Abdulaziz Y. Alkayas","Federico Renda","Irfan Hussain"],"pdf_url":"https://arxiv.org/pdf/2501.06806v1.pdf","comment":"Robosoft 2025 conference"},{"id":"http://arxiv.org/abs/2501.06783v1","updated":"2025-01-12T11:42:28Z","published":"2025-01-12T11:42:28Z","title":"Cost-Effective Robotic Handwriting System with AI Integration","summary":" This paper introduces a cost-effective robotic handwriting system designed to\nreplicate human-like handwriting with high precision. Combining a Raspberry Pi\nPico microcontroller, 3D-printed components, and a machine learning-based\nhandwriting generation model implemented via TensorFlow.js, the system converts\nuser-supplied text into realistic stroke trajectories. By leveraging\nlightweight 3D-printed materials and efficient mechanical designs, the system\nachieves a total hardware cost of approximately \\$56, significantly\nundercutting commercial alternatives. Experimental evaluations demonstrate\nhandwriting precision within $\\pm$0.3 millimeters and a writing speed of\napproximately 200 mm/min, positioning the system as a viable solution for\neducational, research, and assistive applications. 
This study seeks to lower\nthe barriers to personalized handwriting technologies, making them accessible\nto a broader audience.\n","authors":["Tianyi Huang","Richard Xiong"],"pdf_url":"https://arxiv.org/pdf/2501.06783v1.pdf","comment":"This is an updated version of a paper originally presented at the\n 2024 IEEE Long Island Systems, Applications and Technology Conference (LISAT)"},{"id":"http://arxiv.org/abs/2403.11639v2","updated":"2025-01-12T09:21:44Z","published":"2024-03-18T10:21:05Z","title":"An Accurate and Real-time Relative Pose Estimation from Triple\n Point-line Images by Decoupling Rotation and Translation","summary":" Line features are valid complements for point features in man-made\nenvironments. 3D-2D constraints provided by line features have been widely used\nin Visual Odometry (VO) and Structure-from-Motion (SfM) systems. However, how\nto accurately solve three-view relative motion only with 2D observations of\npoints and lines in real time has not been fully explored. In this paper, we\npropose a novel three-view pose solver based on rotation-translation decoupled\nestimation. First, a high-precision rotation estimation method based on normal\nvector coplanarity constraints that consider the uncertainty of observations is\nproposed, which can be solved by Levenberg-Marquardt (LM) algorithm\nefficiently. Second, a robust linear translation constraint that minimizes the\ndegree of the rotation components and feature observation components in\nequations is elaborately designed for estimating translations accurately.\nExperiments on synthetic data and real-world data show that the proposed\napproach improves both rotation and translation accuracy compared to the\nclassical trifocal-tensor-based method and the state-of-the-art two-view\nalgorithm in outdoor and indoor environments.\n","authors":["Zewen Xu","Yijia He","Hao Wei","Bo Xu","BinJian Xie","Yihong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.11639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02444v3","updated":"2025-01-12T08:24:18Z","published":"2024-09-04T04:44:21Z","title":"USV-AUV Collaboration Framework for Underwater Tasks under Extreme Sea\n Conditions","summary":" Autonomous underwater vehicles (AUVs) are valuable for ocean exploration due\nto their flexibility and ability to carry communication and detection units.\nNevertheless, AUVs alone often face challenges in harsh and extreme sea\nconditions. This study introduces a unmanned surface vehicle (USV)-AUV\ncollaboration framework, which includes high-precision multi-AUV positioning\nusing USV path planning via Fisher information matrix optimization and\nreinforcement learning for multi-AUV cooperative tasks. Applied to a multi-AUV\nunderwater data collection task scenario, extensive simulations validate the\nframework's feasibility and superior performance, highlighting exceptional\ncoordination and robustness under extreme sea conditions. 
To accelerate\nrelevant research in this field, we have made the simulation code (demo\nversion) available as open-source.\n","authors":["Jingzehua Xu","Guanwen Xie","Xinqi Wang","Yimian Ding","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02444v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06719v1","updated":"2025-01-12T05:09:20Z","published":"2025-01-12T05:09:20Z","title":"Hierarchical Sampling-based Planner with LTL Constraints and Text\n Prompting","summary":" This project introduces a hierarchical planner integrating Linear Temporal\nLogic (LTL) constraints with natural language prompting for robot motion\nplanning. The framework decomposes maps into regions, generates directed\ngraphs, and converts them into transition systems for high-level planning. Text\ninstructions are translated into LTL formulas and converted to Deterministic\nFinite Automata (DFA) for sequential goal-reaching tasks while adhering to\nsafety constraints. High-level plans, derived via Breadth-First Search (BFS),\nguide low-level planners like Exploring Random Trees (RRT) and Probabilistic\nRoadmaps (PRM) for obstacle-avoidant navigation along with LTL tasks. The\napproach demonstrates adaptability to various task complexities, though\nchallenges such as graph construction overhead and suboptimal path generation\nremain. Future directions include extending to considering terrain conditions\nand incorporating higher-order dynamics.\n","authors":["Jingzhan Ge","Zi-Hao Zhang","Sheng-En Huang"],"pdf_url":"https://arxiv.org/pdf/2501.06719v1.pdf","comment":"8 pages, 17 figures"},{"id":"http://arxiv.org/abs/2501.06693v1","updated":"2025-01-12T03:01:15Z","published":"2025-01-12T03:01:15Z","title":"Vid2Sim: Realistic and Interactive Simulation from Video for Urban\n Navigation","summary":" Sim-to-real gap has long posed a significant challenge for robot learning in\nsimulation, preventing the deployment of learned models in the real world.\nPrevious work has primarily focused on domain randomization and system\nidentification to mitigate this gap. However, these methods are often limited\nby the inherent constraints of the simulation and graphics engines. In this\nwork, we propose Vid2Sim, a novel framework that effectively bridges the\nsim2real gap through a scalable and cost-efficient real2sim pipeline for neural\n3D scene reconstruction and simulation. Given a monocular video as input,\nVid2Sim can generate photorealistic and physically interactable 3D simulation\nenvironments to enable the reinforcement learning of visual navigation agents\nin complex urban environments. Extensive experiments demonstrate that Vid2Sim\nsignificantly improves the performance of urban navigation in the digital twins\nand real world by 31.2% and 68.3% in success rate compared with agents trained\nwith prior simulation methods.\n","authors":["Ziyang Xie","Zhizheng Liu","Zhenghao Peng","Wayne Wu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.06693v1.pdf","comment":"Project page: https://metadriverse.github.io/vid2sim/"},{"id":"http://arxiv.org/abs/2501.06680v1","updated":"2025-01-12T01:31:07Z","published":"2025-01-12T01:31:07Z","title":"Application of Vision-Language Model to Pedestrians Behavior and Scene\n Understanding in Autonomous Driving","summary":" Autonomous driving (AD) has experienced significant improvements in recent\nyears and achieved promising 3D detection, classification, and localization\nresults. However, many challenges remain, e.g. 
semantic understanding of\npedestrians' behaviors, and downstream handling for pedestrian interactions.\nRecent studies in applications of Large Language Models (LLM) and\nVision-Language Models (VLM) have achieved promising results in scene\nunderstanding and high-level maneuver planning in diverse traffic scenarios.\nHowever, deploying the billion-parameter LLMs to vehicles requires significant\ncomputation and memory resources. In this paper, we analyzed effective\nknowledge distillation of semantic labels to smaller Vision networks, which can\nbe used for the semantic representation of complex scenes for downstream\ndecision-making for planning and control.\n","authors":["Haoxiang Gao","Yu Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.06680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15908v2","updated":"2025-01-12T01:03:35Z","published":"2024-12-20T13:59:15Z","title":"Speedup Techniques for Switchable Temporal Plan Graph Optimization","summary":" Multi-Agent Path Finding (MAPF) focuses on planning collision-free paths for\nmultiple agents. However, during the execution of a MAPF plan, agents may\nencounter unexpected delays, which can lead to inefficiencies, deadlocks, or\neven collisions. To address these issues, the Switchable Temporal Plan Graph\nprovides a framework for finding an acyclic Temporal Plan Graph with the\nminimum execution cost under delays, ensuring deadlock- and collision-free\nexecution. Unfortunately, existing optimal algorithms, such as Mixed Integer\nLinear Programming and Graph-Based Switchable Edge Search (GSES), are often too\nslow for practical use. This paper introduces Improved GSES, which\nsignificantly accelerates GSES through four speedup techniques: stronger\nadmissible heuristics, edge grouping, prioritized branching, and incremental\nimplementation. Experiments conducted on four different map types with varying\nnumbers of agents demonstrate that Improved GSES consistently achieves over\ntwice the success rate of GSES and delivers up to a 30-fold speedup on\ninstances where both methods successfully find solutions.\n","authors":["He Jiang","Muhan Lin","Jiaoyang Li"],"pdf_url":"https://arxiv.org/pdf/2412.15908v2.pdf","comment":"Accepted by AAAI 2025. This version contains the appendix"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.03397v3","updated":"2025-01-12T23:38:16Z","published":"2025-01-06T21:34:52Z","title":"DoubleDiffusion: Combining Heat Diffusion with Denoising Diffusion for\n Generative Learning on 3D Meshes","summary":" This paper proposes DoubleDiffusion, a novel framework that combines heat\ndissipation diffusion and denoising diffusion for direct generative learning on\n3D mesh surfaces. Our approach addresses the challenges of generating\ncontinuous signal distributions residing on a curve manifold surface. Unlike\nprevious methods that rely on unrolling 3D meshes into 2D or adopting field\nrepresentations, DoubleDiffusion leverages the Laplacian-Beltrami operator to\nprocess features respecting the mesh structure. This combination enables\neffective geometry-aware signal diffusion across the underlying geometry. As\nshown in Fig.1, we demonstrate that DoubleDiffusion has the ability to generate\nRGB signal distributions on complex 3D mesh surfaces and achieves per-category\nshape-conditioned texture generation across different shape geometry. 
Our work\ncontributes a new direction in diffusion-based generative modeling on 3D\nsurfaces, with potential applications in the field of 3D asset generation.\n","authors":["Xuyang Wang","Ziang Cheng","Zhenyu Li","Jiayu Yang","Haorui Ji","Pan Ji","Mehrtash Harandi","Richard Hartley","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2501.03397v3.pdf","comment":"Codes: https://github.com/Wxyxixixi/DoubleDiffusion_3D_Mesh"},{"id":"http://arxiv.org/abs/2403.15442v3","updated":"2025-01-12T22:16:50Z","published":"2024-03-17T11:28:23Z","title":"Artificial Intelligence for Cochlear Implants: Review of Strategies,\n Challenges, and Perspectives","summary":" Automatic speech recognition (ASR) plays a pivotal role in our daily lives,\noffering utility not only for interacting with machines but also for\nfacilitating communication for individuals with partial or profound hearing\nimpairments. The process involves receiving the speech signal in analog form,\nfollowed by various signal processing algorithms to make it compatible with\ndevices of limited capacities, such as cochlear implants (CIs). Unfortunately,\nthese implants, equipped with a finite number of electrodes, often result in\nspeech distortion during synthesis. Despite efforts by researchers to enhance\nreceived speech quality using various state-of-the-art (SOTA) signal processing\ntechniques, challenges persist, especially in scenarios involving multiple\nsources of speech, environmental noise, and other adverse conditions. The\nadvent of new artificial intelligence (AI) methods has ushered in cutting-edge\nstrategies to address the limitations and difficulties associated with\ntraditional signal processing techniques dedicated to CIs. This review aims to\ncomprehensively cover advancements in CI-based ASR and speech enhancement,\namong other related aspects. The primary objective is to provide a thorough\noverview of metrics and datasets, exploring the capabilities of AI algorithms\nin this biomedical field, and summarizing and commenting on the best results\nobtained. Additionally, the review will delve into potential applications and\nsuggest future directions to bridge existing research gaps in this domain.\n","authors":["Billel Essaid","Hamza Kheddar","Noureddine Batel","Muhammad E. H. Chowdhury","Abderrahmane Lakas"],"pdf_url":"https://arxiv.org/pdf/2403.15442v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06942v1","updated":"2025-01-12T21:39:06Z","published":"2025-01-12T21:39:06Z","title":"Comparison of Autoencoders for tokenization of ASL datasets","summary":" Generative AI, powered by large language models (LLMs), has revolutionized\napplications across text, audio, images, and video. This study focuses on\ndeveloping and evaluating encoder-decoder architectures for the American Sign\nLanguage (ASL) image dataset, consisting of 87,000 images across 29 hand sign\nclasses. Three approaches were compared: Feedforward Autoencoders,\nConvolutional Autoencoders, and Diffusion Autoencoders. The Diffusion\nAutoencoder outperformed the others, achieving the lowest mean squared error\n(MSE) and highest Mean Opinion Score (MOS) due to its probabilistic noise\nmodeling and iterative denoising capabilities. The Convolutional Autoencoder\ndemonstrated effective spatial feature extraction but lacked the robustness of\nthe diffusion process, while the Feedforward Autoencoder served as a baseline\nwith limitations in handling complex image data. 
Objective and subjective\nevaluations confirmed the superiority of the Diffusion Autoencoder for\nhigh-fidelity image reconstruction, emphasizing its potential in multimodal AI\napplications such as sign language recognition and generation. This work\nprovides critical insights into designing robust encoder-decoder systems to\nadvance multimodal AI capabilities.\n","authors":["Vouk Praun-Petrovic","Aadhvika Koundinya","Lavanya Prahallad"],"pdf_url":"https://arxiv.org/pdf/2501.06942v1.pdf","comment":"9 pages, 2 tables, 4 figures"},{"id":"http://arxiv.org/abs/2501.06939v1","updated":"2025-01-12T21:33:06Z","published":"2025-01-12T21:33:06Z","title":"Super-Resolution of 3D Micro-CT Images Using Generative Adversarial\n Networks: Enhancing Resolution and Segmentation Accuracy","summary":" We develop a procedure for substantially improving the quality of segmented\n3D micro-Computed Tomography (micro-CT) images of rocks with a Machine Learning\n(ML) Generative Model. The proposed model enhances the resolution eightfold\n(8x) and addresses segmentation inaccuracies due to the overlapping X-ray\nattenuation in micro-CT measurement for different rock minerals and phases. The\nproposed generative model is a 3D Deep Convolutional Wasserstein Generative\nAdversarial Network with Gradient Penalty (3D DC WGAN-GP). The algorithm is\ntrained on segmented 3D low-resolution micro-CT images and segmented unpaired\ncomplementary 2D high-resolution Laser Scanning Microscope (LSM) images. The\nalgorithm was demonstrated on multiple samples of Berea sandstones. We achieved\nhigh-quality super-resolved 3D images with a resolution of 0.4375 micro-m/voxel\nand accurate segmentation for constituting minerals and pore space. The\ndescribed procedure can significantly expand the modern capabilities of digital\nrock physics.\n","authors":["Evgeny Ugolkov","Xupeng He","Hyung Kwak","Hussein Hoteit"],"pdf_url":"https://arxiv.org/pdf/2501.06939v1.pdf","comment":"24 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.06938v1","updated":"2025-01-12T21:30:44Z","published":"2025-01-12T21:30:44Z","title":"Evaluating unsupervised contrastive learning framework for MRI sequences\n classification","summary":" The automatic identification of Magnetic Resonance Imaging (MRI) sequences\ncan streamline clinical workflows by reducing the time radiologists spend\nmanually sorting and identifying sequences, thereby enabling faster diagnosis\nand treatment planning for patients. However, the lack of standardization in\nthe parameters of MRI scans poses challenges for automated systems and\ncomplicates the generation and utilization of datasets for machine learning\nresearch. To address this issue, we propose a system for MRI sequence\nidentification using an unsupervised contrastive deep learning framework. By\ntraining a convolutional neural network based on the ResNet-18 architecture,\nour system classifies nine common MRI sequence types as a 9-class\nclassification problem. The network was trained using an in-house internal\ndataset and validated on several public datasets, including BraTS, ADNI, Fused\nRadiology-Pathology Prostate Dataset, the Breast Cancer Dataset (ACRIN), among\nothers, encompassing diverse acquisition protocols and requiring only 2D slices\nfor training. 
Our system achieves a classification accuracy of over 0.95 across\nthe nine most common MRI sequence types.\n","authors":["Yuli Wang","Kritika Iyer","Sep Farhand","Yoshihisa Shinagawa"],"pdf_url":"https://arxiv.org/pdf/2501.06938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06927v1","updated":"2025-01-12T20:36:39Z","published":"2025-01-12T20:36:39Z","title":"CULTURE3D: Cultural Landmarks and Terrain Dataset for 3D Applications","summary":" In this paper, we present a large-scale fine-grained dataset using\nhigh-resolution images captured from locations worldwide. Compared to existing\ndatasets, our dataset offers a significantly larger size and includes a higher\nlevel of detail, making it uniquely suited for fine-grained 3D applications.\nNotably, our dataset is built using drone-captured aerial imagery, which\nprovides a more accurate perspective for capturing real-world site layouts and\narchitectural structures. By reconstructing environments with these detailed\nimages, our dataset supports applications such as the COLMAP format for\nGaussian Splatting and the Structure-from-Motion (SfM) method. It is compatible\nwith widely-used techniques including SLAM, Multi-View Stereo, and Neural\nRadiance Fields (NeRF), enabling accurate 3D reconstructions and point clouds.\nThis makes it a benchmark for reconstruction and segmentation tasks. The\ndataset enables seamless integration with multi-modal data, supporting a range\nof 3D applications, from architectural reconstruction to virtual tourism. Its\nflexibility promotes innovation, facilitating breakthroughs in 3D modeling and\nanalysis.\n","authors":["Xinyi Zheng","Steve Zhang","Weizhe Lin","Aaron Zhang","Walterio W. Mayol-Cuevas","Junxiao Shen"],"pdf_url":"https://arxiv.org/pdf/2501.06927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06922v1","updated":"2025-01-12T20:17:46Z","published":"2025-01-12T20:17:46Z","title":"Benchmarking YOLOv8 for Optimal Crack Detection in Civil Infrastructure","summary":" Ensuring the structural integrity and safety of bridges is crucial for the\nreliability of transportation networks and public safety. Traditional crack\ndetection methods are increasingly being supplemented or replaced by advanced\nartificial intelligence (AI) techniques. However, most of the models rely on\ntwo-stage target detection algorithms, which pose concerns for real-time\napplications due to their lower speed. While models such as YOLO (You Only Look\nOnce) have emerged as transformative tools due to their remarkable speed and\naccuracy. However, the potential of the latest YOLOv8 framework in this domain\nremains underexplored. This study bridges that gap by rigorously evaluating\nYOLOv8's performance across five model scales (nano, small, medium, large, and\nextra-large) using a high-quality Roboflow dataset. A comprehensive\nhyperparameter optimization was performed, testing six state-of-the-art\noptimizers-Stochastic Gradient Descent, Adaptive Moment Estimation, Adam with\nDecoupled Weight Decay, Root Mean Square Propagation, Rectified Adam, and\nNesterov-accelerated Adam. Results revealed that YOLOv8, optimized with\nStochastic Gradient Descent, delivered exceptional accuracy and speed, setting\na new benchmark for real-time crack detection. 
Beyond its immediate\napplication, this research positions YOLOv8 as a foundational approach for\nintegrating advanced computer vision techniques into infrastructure monitoring.\nBy enabling more reliable and proactive maintenance of aging bridge networks,\nthis work paves the way for safer, more efficient transportation systems\nworldwide.\n","authors":["Woubishet Zewdu Taffese","Ritesh Sharma","Mohammad Hossein Afsharmovahed","Gunasekaran Manogaran","Genda Chen"],"pdf_url":"https://arxiv.org/pdf/2501.06922v1.pdf","comment":"Accepted at 104th TRB Annual Meeting 2025"},{"id":"http://arxiv.org/abs/2501.06918v1","updated":"2025-01-12T20:01:07Z","published":"2025-01-12T20:01:07Z","title":"Driver Age and Its Effect on Key Driving Metrics: Insights from Dynamic\n Vehicle Data","summary":" By 2030, the senior population aged 65 and older is expected to increase by\nover 50%, significantly raising the number of older drivers on the road.\nDrivers over 70 face higher crash death rates compared to those in their\nforties and fifties, underscoring the importance of developing more effective\nsafety interventions for this demographic. Although the impact of aging on\ndriving behavior has been studied, there is limited research on how these\nbehaviors translate into real-world driving scenarios. This study addresses\nthis need by leveraging Naturalistic Driving Data (NDD) to analyze driving\nperformance measures - specifically, speed limit adherence on interstates and\ndeceleration at stop intersections, both of which may be influenced by\nage-related declines. Using NDD, we developed Cumulative Distribution Functions\n(CDFs) to establish benchmarks for key driving behaviors among senior and young\ndrivers. Our analysis, which included anomaly detection, benchmark comparisons,\nand accuracy evaluations, revealed significant differences in driving patterns\nprimarily related to speed limit adherence at 75mph. While our approach shows\npromising potential for enhancing Advanced Driver Assistance Systems (ADAS) by\nproviding tailored interventions based on age-specific adherence to speed limit\ndriving patterns, we recognize the need for additional data to refine and\nvalidate metrics for other driving behaviors. By establishing precise\nbenchmarks for various driving performance metrics, ADAS can effectively\nidentify anomalies, such as abrupt deceleration, which may indicate impaired\ndriving or other safety concerns. This study lays a strong foundation for\nfuture research aimed at improving safety interventions through detailed\ndriving behavior analysis.\n","authors":["Aparna Joshi","Kojo Adugyamfi","Jennifer Merickel","Pujitha Gunaratne","Anuj Sharma"],"pdf_url":"https://arxiv.org/pdf/2501.06918v1.pdf","comment":"21 pages, 9 figures, 4 Tables, 104th TRB Annual Meeting 2025,\n Washington DC"},{"id":"http://arxiv.org/abs/2501.06909v1","updated":"2025-01-12T19:45:42Z","published":"2025-01-12T19:45:42Z","title":"Local Foreground Selection aware Attentive Feature Reconstruction for\n few-shot fine-grained plant species classification","summary":" Plant species exhibit significant intra-class variation and minimal\ninter-class variation. To enhance classification accuracy, it is essential to\nreduce intra-class variation while maximizing inter-class variation. This paper\naddresses plant species classification using a limited number of labelled\nsamples and introduces a novel Local Foreground Selection(LFS) attention\nmechanism. 
LFS is a straightforward module designed to generate discriminative\nsupport and query feature maps. It operates by integrating two types of\nattention: local attention, which captures local spatial details to enhance\nfeature discrimination and increase inter-class differentiation, and foreground\nselection attention, which emphasizes the foreground plant object while\nmitigating background interference. By focusing on the foreground, the query\nand support features selectively highlight relevant feature sequences and\ndisregard less significant background sequences, thereby reducing intra-class\ndifferences. Experimental results from three plant species datasets demonstrate\nthe effectiveness of the proposed LFS attention mechanism and its complementary\nadvantages over previous feature reconstruction methods.\n","authors":["Aisha Zulfiqar","Ebroul Izquiedro"],"pdf_url":"https://arxiv.org/pdf/2501.06909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06903v1","updated":"2025-01-12T19:01:05Z","published":"2025-01-12T19:01:05Z","title":"Synthetic Prior for Few-Shot Drivable Head Avatar Inversion","summary":" We present SynShot, a novel method for the few-shot inversion of a drivable\nhead avatar based on a synthetic prior. We tackle two major challenges. First,\ntraining a controllable 3D generative network requires a large number of\ndiverse sequences, for which pairs of images and high-quality tracked meshes\nare not always available. Second, state-of-the-art monocular avatar models\nstruggle to generalize to new views and expressions, lacking a strong prior and\noften overfitting to a specific viewpoint distribution. Inspired by machine\nlearning models trained solely on synthetic data, we propose a method that\nlearns a prior model from a large dataset of synthetic heads with diverse\nidentities, expressions, and viewpoints. With few input images, SynShot\nfine-tunes the pretrained synthetic prior to bridge the domain gap, modeling a\nphotorealistic head avatar that generalizes to novel expressions and\nviewpoints. We model the head avatar using 3D Gaussian splatting and a\nconvolutional encoder-decoder that outputs Gaussian parameters in UV texture\nspace. To account for the different modeling complexities over parts of the\nhead (e.g., skin vs hair), we embed the prior with explicit control for\nupsampling the number of per-part primitives. Compared to state-of-the-art\nmonocular methods that require thousands of real training images, SynShot\nsignificantly improves novel view and expression synthesis.\n","authors":["Wojciech Zielonka","Stephan J. Garbin","Alexandros Lattas","George Kopanas","Paulo Gotardo","Thabo Beeler","Justus Thies","Timo Bolkart"],"pdf_url":"https://arxiv.org/pdf/2501.06903v1.pdf","comment":"Website https://zielon.github.io/synshot/"},{"id":"http://arxiv.org/abs/2501.06897v1","updated":"2025-01-12T18:38:51Z","published":"2025-01-12T18:38:51Z","title":"ActiveGAMER: Active GAussian Mapping through Efficient Rendering","summary":" We introduce ActiveGAMER, an active mapping system that utilizes 3D Gaussian\nSplatting (3DGS) to achieve high-quality, real-time scene mapping and\nexploration. Unlike traditional NeRF-based methods, which are computationally\ndemanding and restrict active mapping performance, our approach leverages the\nefficient rendering capabilities of 3DGS, allowing effective and efficient\nexploration in complex environments. 
The core of our system is a\nrendering-based information gain module that dynamically identifies the most\ninformative viewpoints for next-best-view planning, enhancing both geometric\nand photometric reconstruction accuracy. ActiveGAMER also integrates a\ncarefully balanced framework, combining coarse-to-fine exploration,\npost-refinement, and a global-local keyframe selection strategy to maximize\nreconstruction completeness and fidelity. Our system autonomously explores and\nreconstructs environments with state-of-the-art geometric and photometric\naccuracy and completeness, significantly surpassing existing approaches in both\naspects. Extensive evaluations on benchmark datasets such as Replica and MP3D\nhighlight ActiveGAMER's effectiveness in active mapping tasks.\n","authors":["Liyan Chen","Huangying Zhan","Kevin Chen","Xiangyu Xu","Qingan Yan","Changjiang Cai","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2501.06897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06887v1","updated":"2025-01-12T17:50:47Z","published":"2025-01-12T17:50:47Z","title":"MedGrad E-CLIP: Enhancing Trust and Transparency in AI-Driven Skin\n Lesion Diagnosis","summary":" As deep learning models gain attraction in medical data, ensuring transparent\nand trustworthy decision-making is essential. In skin cancer diagnosis, while\nadvancements in lesion detection and classification have improved accuracy, the\nblack-box nature of these methods poses challenges in understanding their\ndecision processes, leading to trust issues among physicians. This study\nleverages the CLIP (Contrastive Language-Image Pretraining) model, trained on\ndifferent skin lesion datasets, to capture meaningful relationships between\nvisual features and diagnostic criteria terms. To further enhance transparency,\nwe propose a method called MedGrad E-CLIP, which builds on gradient-based\nE-CLIP by incorporating a weighted entropy mechanism designed for complex\nmedical imaging like skin lesions. This approach highlights critical image\nregions linked to specific diagnostic descriptions. The developed integrated\npipeline not only classifies skin lesions by matching corresponding\ndescriptions but also adds an essential layer of explainability developed\nespecially for medical data. By visually explaining how different features in\nan image relates to diagnostic criteria, this approach demonstrates the\npotential of advanced vision-language models in medical image analysis,\nultimately improving transparency, robustness, and trust in AI-driven\ndiagnostic systems.\n","authors":["Sadia Kamal","Tim Oates"],"pdf_url":"https://arxiv.org/pdf/2501.06887v1.pdf","comment":"Accepted to 2025 IEEE/CVF Winter Conference on Applications of\n Computer Vision Workshops (WACVW)"},{"id":"http://arxiv.org/abs/2501.06884v1","updated":"2025-01-12T17:41:23Z","published":"2025-01-12T17:41:23Z","title":"Transforming Vision Transformer: Towards Efficient Multi-Task\n Asynchronous Learning","summary":" Multi-Task Learning (MTL) for Vision Transformer aims at enhancing the model\ncapability by tackling multiple tasks simultaneously. Most recent works have\npredominantly focused on designing Mixture-of-Experts (MoE) structures and in\ntegrating Low-Rank Adaptation (LoRA) to efficiently perform multi-task\nlearning. However, their rigid combination hampers both the optimization of MoE\nand the ef fectiveness of reparameterization of LoRA, leading to sub-optimal\nperformance and low inference speed. 
In this work, we propose a novel approach\ndubbed Efficient Multi-Task Learning (EMTAL) by transforming a pre-trained\nVision Transformer into an efficient multi-task learner during training, and\nreparameterizing the learned structure for efficient inference. Specifically,\nwe firstly develop the MoEfied LoRA structure, which decomposes the pre-trained\nTransformer into a low-rank MoE structure and employ LoRA to fine-tune the\nparameters. Subsequently, we take into account the intrinsic asynchronous\nnature of multi-task learning and devise a learning Quality Retaining (QR)\noptimization mechanism, by leveraging the historical high-quality class logits\nto prevent a well-trained task from performance degradation. Finally, we design\na router fading strategy to integrate the learned parameters into the original\nTransformer, archiving efficient inference. Extensive experiments on public\nbenchmarks demonstrate the superiority of our method, compared to the\nstate-of-the-art multi-task learning approaches.\n","authors":["Hanwen Zhong","Jiaxin Chen","Yutong Zhang","Di Huang","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2501.06884v1.pdf","comment":"Accepted by the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2501.06880v1","updated":"2025-01-12T17:28:09Z","published":"2025-01-12T17:28:09Z","title":"Real-Time Neural-Enhancement for Online Cloud Gaming","summary":" Online Cloud gaming demands real-time, high-quality video transmission across\nvariable wide-area networks (WANs). Neural-enhanced video transmission\nalgorithms employing super-resolution (SR) for video quality enhancement have\neffectively challenged WAN environments. However, these SR-based methods\nrequire intensive fine-tuning for the whole video, making it infeasible in\ndiverse online cloud gaming. To address this, we introduce River, a cloud\ngaming delivery framework designed based on the observation that video segment\nfeatures in cloud gaming are typically repetitive and redundant. This permits a\nsignificant opportunity to reuse fine-tuned SR models, reducing the fine-tuning\nlatency of minutes to query latency of milliseconds. To enable the idea, we\ndesign a practical system that addresses several challenges, such as model\norganization, online model scheduler, and transfer strategy. River first builds\na content-aware encoder that fine-tunes SR models for diverse video segments\nand stores them in a lookup table. When delivering cloud gaming video streams\nonline, River checks the video features and retrieves the most relevant SR\nmodels to enhance the frame quality. Meanwhile, if no existing SR model\nperforms well enough for some video segments, River will further fine-tune new\nmodels and update the lookup table. Finally, to avoid the overhead of streaming\nmodel weight to the clients, River designs a prefetching strategy that predicts\nthe models with the highest possibility of being retrieved. Our evaluation\nbased on real video game streaming demonstrates River can reduce redundant\ntraining overhead by 44% and improve the Peak-Signal-to-Noise-Ratio by 1.81dB\ncompared to the SOTA solutions. 
Practical deployment shows River meets\nreal-time requirements, achieving approximately 720p 20fps on mobile devices.\n","authors":["Shan Jiang","Zhenhua Han","Haisheng Tan","Xinyang Jiang","Yifan Yang","Xiaoxi Zhang","Hongqiu Ni","Yuqing Yang","Xiang-Yang Li"],"pdf_url":"https://arxiv.org/pdf/2501.06880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06879v1","updated":"2025-01-12T17:26:24Z","published":"2025-01-12T17:26:24Z","title":"Defect Detection Network In PCB Circuit Devices Based on GAN Enhanced\n YOLOv11","summary":" This study proposes an advanced method for surface defect detection in\nprinted circuit boards (PCBs) using an improved YOLOv11 model enhanced with a\ngenerative adversarial network (GAN). The approach focuses on identifying six\ncommon defect types: missing hole, rat bite, open circuit, short circuit, burr,\nand virtual welding. By employing GAN to generate synthetic defect images, the\ndataset is augmented with diverse and realistic patterns, improving the model's\nability to generalize, particularly for complex and infrequent defects like\nburrs. The enhanced YOLOv11 model is evaluated on a PCB defect dataset,\ndemonstrating significant improvements in accuracy, recall, and robustness,\nespecially when dealing with defects in complex environments or small targets.\nThis research contributes to the broader field of electronic design automation\n(EDA), where efficient defect detection is a crucial step in ensuring\nhigh-quality PCB manufacturing. By integrating advanced deep learning\ntechniques, this approach enhances the automation and precision of defect\ndetection, reducing reliance on manual inspection and accelerating\ndesign-to-production workflows. The findings underscore the importance of\nincorporating GAN-based data augmentation and optimized detection architectures\nin EDA processes, providing valuable insights for improving reliability and\nefficiency in PCB defect detection within industrial applications.\n","authors":["Jiayi Huang","Feiyun Zhao","Lieyang Chen"],"pdf_url":"https://arxiv.org/pdf/2501.06879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06878v1","updated":"2025-01-12T17:24:51Z","published":"2025-01-12T17:24:51Z","title":"Uncertainty-Aware Online Extrinsic Calibration: A Conformal Prediction\n Approach","summary":" Accurate sensor calibration is crucial for autonomous systems, yet its\nuncertainty quantification remains underexplored. We present the first approach\nto integrate uncertainty awareness into online extrinsic calibration, combining\nMonte Carlo Dropout with Conformal Prediction to generate prediction intervals\nwith a guaranteed level of coverage. Our method proposes a framework to enhance\nexisting calibration models with uncertainty quantification, compatible with\nvarious network architectures. Validated on KITTI (RGB Camera-LiDAR) and DSEC\n(Event Camera-LiDAR) datasets, we demonstrate effectiveness across different\nvisual sensor types, measuring performance with adapted metrics to evaluate the\nefficiency and reliability of the intervals. 
By providing calibration\nparameters with quantifiable confidence measures, we offer insights into the\nreliability of calibration estimates, which can greatly improve the robustness\nof sensor fusion in dynamic environments and usefully serve the Computer Vision\ncommunity.\n","authors":["Mathieu Cocheteux","Julien Moreau","Franck Davoine"],"pdf_url":"https://arxiv.org/pdf/2501.06878v1.pdf","comment":"Accepted for publication at WACV 2025"},{"id":"http://arxiv.org/abs/2411.17922v3","updated":"2025-01-12T16:50:07Z","published":"2024-11-26T22:31:09Z","title":"Exploring Superpixel Segmentation Methods in the Context of Citizen\n Science and Deforestation Detection","summary":" Tropical forests play an essential role in the planet's ecosystem, making the\nconservation of these biomes a worldwide priority. However, ongoing\ndeforestation and degradation pose a significant threat to their existence,\nnecessitating effective monitoring and the proposal of actions to mitigate the\ndamage caused by these processes. In this regard, initiatives range from\ngovernment and private sector monitoring programs to solutions based on citizen\nscience campaigns, for example. Particularly in the context of citizen science\ncampaigns, the segmentation of remote sensing images to identify deforested\nareas and subsequently submit them to analysis by non-specialized volunteers is\nnecessary. Thus, segmentation using superpixel-based techniques proves to be a\nviable solution for this important task. Therefore, this paper presents an\nanalysis of 22 superpixel-based segmentation methods applied to remote sensing\nimages, aiming to identify which of them are more suitable for generating\nsegments for citizen science campaigns. The results reveal that seven of the\nsegmentation methods outperformed the baseline method (SLIC) currently employed\nin the ForestEyes citizen science project, indicating an opportunity for\nimprovement in this important stage of campaign development.\n","authors":["Hugo Resende","Isabela Borlido","Victor Sundermann","Eduardo B. Neto","Silvio Jamil F. Guimarães","Fabio Faria","Alvaro Luiz Fazenda"],"pdf_url":"https://arxiv.org/pdf/2411.17922v3.pdf","comment":"This paper is under review"},{"id":"http://arxiv.org/abs/2501.06869v1","updated":"2025-01-12T16:39:13Z","published":"2025-01-12T16:39:13Z","title":"A Foundational Generative Model for Breast Ultrasound Image Analysis","summary":" Foundational models have emerged as powerful tools for addressing various\ntasks in clinical settings. However, their potential development to breast\nultrasound analysis remains untapped. In this paper, we present BUSGen, the\nfirst foundational generative model specifically designed for breast ultrasound\nimage analysis. Pretrained on over 3.5 million breast ultrasound images, BUSGen\nhas acquired extensive knowledge of breast structures, pathological features,\nand clinical variations. With few-shot adaptation, BUSGen can generate\nrepositories of realistic and informative task-specific data, facilitating the\ndevelopment of models for a wide range of downstream tasks. Extensive\nexperiments highlight BUSGen's exceptional adaptability, significantly\nexceeding real-data-trained foundational models in breast cancer screening,\ndiagnosis, and prognosis. In breast cancer early diagnosis, our approach\noutperformed all board-certified radiologists (n=9), achieving an average\nsensitivity improvement of 16.5% (P-value<0.0001). 
Additionally, we\ncharacterized the scaling effect of using generated data which was as effective\nas the collected real-world data for training diagnostic models. Moreover,\nextensive experiments demonstrated that our approach improved the\ngeneralization ability of downstream models. Importantly, BUSGen protected\npatient privacy by enabling fully de-identified data sharing, making progress\nforward in secure medical data utilization. An online demo of BUSGen is\navailable at https://aibus.bio.\n","authors":["Haojun Yu","Youcheng Li","Nan Zhang","Zihan Niu","Xuantong Gong","Yanwen Luo","Haotian Ye","Siyu He","Quanlin Wu","Wangyan Qin","Mengyuan Zhou","Jie Han","Jia Tao","Ziwei Zhao","Di Dai","Di He","Dong Wang","Binghui Tang","Ling Huo","James Zou","Qingli Zhu","Yong Wang","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2501.06869v1.pdf","comment":"Peking University; Stanford University; Peking University Cancer\n Hospital & Institute; Peking Union Medical College Hospital; Cancer Hospital,\n Chinese Academy of Medical Sciences"},{"id":"http://arxiv.org/abs/2501.06862v1","updated":"2025-01-12T16:22:17Z","published":"2025-01-12T16:22:17Z","title":"LarvSeg: Exploring Image Classification Data For Large Vocabulary\n Semantic Segmentation via Category-wise Attentive Classifier","summary":" Scaling up the vocabulary of semantic segmentation models is extremely\nchallenging because annotating large-scale mask labels is labour-intensive and\ntime-consuming. Recently, language-guided segmentation models have been\nproposed to address this challenge. However, their performance drops\nsignificantly when applied to out-of-distribution categories. In this paper, we\npropose a new large vocabulary semantic segmentation framework, called LarvSeg.\nDifferent from previous works, LarvSeg leverages image classification data to\nscale the vocabulary of semantic segmentation models as large-vocabulary\nclassification datasets usually contain balanced categories and are much easier\nto obtain. However, for classification tasks, the category is image-level,\nwhile for segmentation we need to predict the label at pixel level. To address\nthis issue, we first propose a general baseline framework to incorporate\nimage-level supervision into the training process of a pixel-level segmentation\nmodel, making the trained network perform semantic segmentation on newly\nintroduced categories in the classification data. We then observe that a model\ntrained on segmentation data can group pixel features of categories beyond the\ntraining vocabulary. Inspired by this finding, we design a category-wise\nattentive classifier to apply supervision to the precise regions of\ncorresponding categories to improve the model performance. Extensive\nexperiments demonstrate that LarvSeg significantly improves the large\nvocabulary semantic segmentation performance, especially in the categories\nwithout mask labels. For the first time, we provide a 21K-category semantic\nsegmentation model with the help of ImageNet21K. 
The code is available at\nhttps://github.com/HaojunYu1998/large_voc_seg.\n","authors":["Haojun Yu","Di Dai","Ziwei Zhao","Di He","Han Hu","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2501.06862v1.pdf","comment":"PRCV 2024"},{"id":"http://arxiv.org/abs/2501.02198v2","updated":"2025-01-12T15:42:20Z","published":"2025-01-04T05:20:53Z","title":"Fresh-CL: Feature Realignment through Experts on Hypersphere in\n Continual Learning","summary":" Continual Learning enables models to learn and adapt to new tasks while\nretaining prior knowledge. Introducing new tasks, however, can naturally lead\nto feature entanglement across tasks, limiting the model's capability to\ndistinguish between new domain data. In this work, we propose a method called\nFeature Realignment through Experts on hyperSpHere in Continual Learning\n(Fresh-CL). By leveraging predefined and fixed simplex equiangular tight frame\n(ETF) classifiers on a hypersphere, our model improves feature separation both\nintra and inter tasks. However, the projection to a simplex ETF shifts with new\ntasks, disrupting structured feature representation of previous tasks and\ndegrading performance. Therefore, we propose a dynamic extension of ETF through\nmixture of experts, enabling adaptive projections onto diverse subspaces to\nenhance feature representation. Experiments on 11 datasets demonstrate a 2%\nimprovement in accuracy compared to the strongest baseline, particularly in\nfine-grained datasets, confirming the efficacy of combining ETF and MoE to\nimprove feature distinction in continual learning scenarios.\n","authors":["Zhongyi Zhou","Yaxin Peng","Pin Yi","Minjie Zhu","Chaomin Shen"],"pdf_url":"https://arxiv.org/pdf/2501.02198v2.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.06848v1","updated":"2025-01-12T15:34:24Z","published":"2025-01-12T15:34:24Z","title":"A General Framework for Inference-time Scaling and Steering of Diffusion\n Models","summary":" Diffusion models produce impressive results in modalities ranging from images\nand video to protein design and text. However, generating samples with\nuser-specified properties remains a challenge. Recent research proposes\nfine-tuning models to maximize rewards that capture desired properties, but\nthese methods require expensive training and are prone to mode collapse. In\nthis work, we propose Feynman Kac (FK) steering, an inference-time framework\nfor steering diffusion models with reward functions. FK steering works by\nsampling a system of multiple interacting diffusion processes, called\nparticles, and resampling particles at intermediate steps based on scores\ncomputed using functions called potentials. Potentials are defined using\nrewards for intermediate states and are selected such that a high value\nindicates that the particle will yield a high-reward sample. We explore various\nchoices of potentials, intermediate rewards, and samplers. We evaluate FK\nsteering on text-to-image and text diffusion models. For steering text-to-image\nmodels with a human preference reward, we find that FK steering a 0.8B\nparameter model outperforms a 2.6B parameter fine-tuned model on prompt\nfidelity, with faster sampling and no training. For steering text diffusion\nmodels with rewards for text quality and specific text attributes, we find that\nFK steering generates lower perplexity, more linguistically acceptable outputs\nand enables gradient-free control of attributes like toxicity. 
Our results\ndemonstrate that inference-time scaling and steering of diffusion models, even\nwith off-the-shelf rewards, can provide significant sample quality gains and\ncontrollability benefits. Code is available at\nhttps://github.com/zacharyhorvitz/Fk-Diffusion-Steering .\n","authors":["Raghav Singhal","Zachary Horvitz","Ryan Teehan","Mengye Ren","Zhou Yu","Kathleen McKeown","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2501.06848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04016v2","updated":"2025-01-12T15:24:23Z","published":"2024-07-04T15:46:01Z","title":"Mitigating Low-Frequency Bias: Feature Recalibration and Frequency\n Attention Regularization for Adversarial Robustness","summary":" Ensuring the robustness of deep neural networks against adversarial attacks\nremains a fundamental challenge in computer vision. While adversarial training\n(AT) has emerged as a promising defense strategy, our analysis reveals a\ncritical limitation: AT-trained models exhibit a bias toward low-frequency\nfeatures while neglecting high-frequency components. This bias is particularly\nconcerning as each frequency component carries distinct and crucial\ninformation: low-frequency features encode fundamental structural patterns,\nwhile high-frequency features capture intricate details and textures. To\naddress this limitation, we propose High-Frequency Feature Disentanglement and\nRecalibration (HFDR), a novel module that strategically separates and\nrecalibrates frequency-specific features to capture latent semantic cues. We\nfurther introduce frequency attention regularization to harmonize feature\nextraction across the frequency spectrum and mitigate the inherent\nlow-frequency bias of AT. Extensive experiments demonstrate our method's\nsuperior performance against white-box attacks and transfer attacks, while\nexhibiting strong generalization capabilities across diverse scenarios.\n","authors":["Kejia Zhang","Juanjuan Weng","Yuanzheng Cai","Zhiming Luo","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2407.04016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06841v1","updated":"2025-01-12T15:18:31Z","published":"2025-01-12T15:18:31Z","title":"Faithful Counterfactual Visual Explanations (FCVE)","summary":" Deep learning models in computer vision have made remarkable progress, but\ntheir lack of transparency and interpretability remains a challenge. The\ndevelopment of explainable AI can enhance the understanding and performance of\nthese models. However, existing techniques often struggle to provide convincing\nexplanations that non-experts easily understand, and they cannot accurately\nidentify models' intrinsic decision-making processes. To address these\nchallenges, we propose to develop a counterfactual explanation (CE) model that\nbalances plausibility and faithfulness. This model generates easy-to-understand\nvisual explanations by making minimum changes necessary in images without\naltering the pixel data. Instead, the proposed method identifies internal\nconcepts and filters learned by models and leverages them to produce plausible\ncounterfactual explanations. 
The provided explanations reflect the internal\ndecision-making process of the model, thus ensuring faithfulness to the model.\n","authors":["Bismillah Khan","Syed Ali Tariq","Tehseen Zia","Muhammad Ahsan","David Windridge"],"pdf_url":"https://arxiv.org/pdf/2501.06841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03880v2","updated":"2025-01-12T15:18:28Z","published":"2025-01-07T15:43:36Z","title":"SELMA3D challenge: Self-supervised learning for 3D light-sheet\n microscopy image segmentation","summary":" Recent innovations in light sheet microscopy, paired with developments in\ntissue clearing techniques, enable the 3D imaging of large mammalian tissues\nwith cellular resolution. Combined with the progress in large-scale data\nanalysis, driven by deep learning, these innovations empower researchers to\nrapidly investigate the morphological and functional properties of diverse\nbiological samples. Segmentation, a crucial preliminary step in the analysis\nprocess, can be automated using domain-specific deep learning models with\nexpert-level performance. However, these models exhibit high sensitivity to\ndomain shifts, leading to a significant drop in accuracy when applied to data\noutside their training distribution. To address this limitation, and inspired\nby the recent success of self-supervised learning in training generalizable\nmodels, we organized the SELMA3D Challenge during the MICCAI 2024 conference.\nSELMA3D provides a vast collection of light-sheet images from cleared mice and\nhuman brains, comprising 35 large 3D images-each with over 1000^3 voxels-and\n315 annotated small patches for finetuning, preliminary testing and final\ntesting. The dataset encompasses diverse biological structures, including\nvessel-like and spot-like structures. Five teams participated in all phases of\nthe challenge, and their proposed methods are reviewed in this paper.\nQuantitative and qualitative results from most participating teams demonstrate\nthat self-supervised learning on large datasets improves segmentation model\nperformance and generalization. We will continue to support and extend SELMA3D\nas an inaugural MICCAI challenge focused on self-supervised learning for 3D\nmicroscopy image segmentation.\n","authors":["Ying Chen","Rami Al-Maskari","Izabela Horvath","Mayar Ali","Luciano Hoher","Kaiyuan Yang","Zengming Lin","Zhiwei Zhai","Mengzhe Shen","Dejin Xun","Yi Wang","Tony Xu","Maged Goubran","Yunheng Wu","Kensaku Mori","Johannes C. Paetzold","Ali Erturk"],"pdf_url":"https://arxiv.org/pdf/2501.03880v2.pdf","comment":"2st version"}]},"2025-01-14T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.08286v1","updated":"2025-01-14T18:01:15Z","published":"2025-01-14T18:01:15Z","title":"VINGS-Mono: Visual-Inertial Gaussian Splatting Monocular SLAM in Large\n Scenes","summary":" VINGS-Mono is a monocular (inertial) Gaussian Splatting (GS) SLAM framework\ndesigned for large scenes. The framework comprises four main components: VIO\nFront End, 2D Gaussian Map, NVS Loop Closure, and Dynamic Eraser. In the VIO\nFront End, RGB frames are processed through dense bundle adjustment and\nuncertainty estimation to extract scene geometry and poses. Based on this\noutput, the mapping module incrementally constructs and maintains a 2D Gaussian\nmap. Key components of the 2D Gaussian Map include a Sample-based Rasterizer,\nScore Manager, and Pose Refinement, which collectively improve mapping speed\nand localization accuracy. 
This enables the SLAM system to handle large-scale\nurban environments with up to 50 million Gaussian ellipsoids. To ensure global\nconsistency in large-scale scenes, we design a Loop Closure module, which\ninnovatively leverages the Novel View Synthesis (NVS) capabilities of Gaussian\nSplatting for loop closure detection and correction of the Gaussian map.\nAdditionally, we propose a Dynamic Eraser to address the inevitable presence of\ndynamic objects in real-world outdoor scenes. Extensive evaluations in indoor\nand outdoor environments demonstrate that our approach achieves localization\nperformance on par with Visual-Inertial Odometry while surpassing recent\nGS/NeRF SLAM methods. It also significantly outperforms all existing methods in\nterms of mapping and rendering quality. Furthermore, we developed a mobile app\nand verified that our framework can generate high-quality Gaussian maps in real\ntime using only a smartphone camera and a low-frequency IMU sensor. To the best\nof our knowledge, VINGS-Mono is the first monocular Gaussian SLAM method\ncapable of operating in outdoor environments and supporting kilometer-scale\nlarge scenes.\n","authors":["Ke Wu","Zicheng Zhang","Muer Tie","Ziqing Ai","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2501.08286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07571v3","updated":"2025-01-14T17:33:46Z","published":"2024-09-11T18:58:16Z","title":"FaVoR: Features via Voxel Rendering for Camera Relocalization","summary":" Camera relocalization methods range from dense image alignment to direct\ncamera pose regression from a query image. Among these, sparse feature matching\nstands out as an efficient, versatile, and generally lightweight approach with\nnumerous applications. However, feature-based methods often struggle with\nsignificant viewpoint and appearance changes, leading to matching failures and\ninaccurate pose estimates. To overcome this limitation, we propose a novel\napproach that leverages a globally sparse yet locally dense 3D representation\nof 2D features. By tracking and triangulating landmarks over a sequence of\nframes, we construct a sparse voxel map optimized to render image patch\ndescriptors observed during tracking. Given an initial pose estimate, we first\nsynthesize descriptors from the voxels using volumetric rendering and then\nperform feature matching to estimate the camera pose. This methodology enables\nthe generation of descriptors for unseen views, enhancing robustness to view\nchanges. We extensively evaluate our method on the 7-Scenes and Cambridge\nLandmarks datasets. 
Our results show that our method significantly outperforms\nexisting state-of-the-art feature representation techniques in indoor\nenvironments, achieving up to a 39% improvement in median translation error.\nAdditionally, our approach yields comparable results to other methods for\noutdoor scenarios while maintaining lower memory and computational costs.\n","authors":["Vincenzo Polizzi","Marco Cannici","Davide Scaramuzza","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2409.07571v3.pdf","comment":"Accepted to the IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV), Tucson, Arizona, US, Feb 28-Mar 4, 2025"},{"id":"http://arxiv.org/abs/2501.06693v2","updated":"2025-01-14T17:29:06Z","published":"2025-01-12T03:01:15Z","title":"Vid2Sim: Realistic and Interactive Simulation from Video for Urban\n Navigation","summary":" Sim-to-real gap has long posed a significant challenge for robot learning in\nsimulation, preventing the deployment of learned models in the real world.\nPrevious work has primarily focused on domain randomization and system\nidentification to mitigate this gap. However, these methods are often limited\nby the inherent constraints of the simulation and graphics engines. In this\nwork, we propose Vid2Sim, a novel framework that effectively bridges the\nsim2real gap through a scalable and cost-efficient real2sim pipeline for neural\n3D scene reconstruction and simulation. Given a monocular video as input,\nVid2Sim can generate photorealistic and physically interactable 3D simulation\nenvironments to enable the reinforcement learning of visual navigation agents\nin complex urban environments. Extensive experiments demonstrate that Vid2Sim\nsignificantly improves the performance of urban navigation in the digital twins\nand real world by 31.2% and 68.3% in success rate compared with agents trained\nwith prior simulation methods.\n","authors":["Ziyang Xie","Zhizheng Liu","Zhenghao Peng","Wayne Wu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.06693v2.pdf","comment":"Project page: https://metadriverse.github.io/vid2sim/"},{"id":"http://arxiv.org/abs/2501.08259v1","updated":"2025-01-14T17:15:27Z","published":"2025-01-14T17:15:27Z","title":"FDPP: Fine-tune Diffusion Policy with Human Preference","summary":" Imitation learning from human demonstrations enables robots to perform\ncomplex manipulation tasks and has recently witnessed huge success. However,\nthese techniques often struggle to adapt behavior to new preferences or changes\nin the environment. To address these limitations, we propose Fine-tuning\nDiffusion Policy with Human Preference (FDPP). FDPP learns a reward function\nthrough preference-based learning. This reward is then used to fine-tune the\npre-trained policy with reinforcement learning (RL), resulting in alignment of\npre-trained policy with new human preferences while still solving the original\ntask. Our experiments across various robotic tasks and preferences demonstrate\nthat FDPP effectively customizes policy behavior without compromising\nperformance. Additionally, we show that incorporating Kullback-Leibler (KL)\nregularization during fine-tuning prevents over-fitting and helps maintain the\ncompetencies of the initial policy.\n","authors":["Yuxin Chen","Devesh K. 
Jha","Masayoshi Tomizuka","Diego Romeres"],"pdf_url":"https://arxiv.org/pdf/2501.08259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08222v1","updated":"2025-01-14T16:05:32Z","published":"2025-01-14T16:05:32Z","title":"Data-driven Spatial Classification using Multi-Arm Bandits for\n Monitoring with Energy-Constrained Mobile Robots","summary":" We consider the spatial classification problem for monitoring using data\ncollected by a coordinated team of mobile robots. Such classification problems\narise in several applications including search-and-rescue and precision\nagriculture. Specifically, we want to classify the regions of a search\nenvironment into interesting and uninteresting as quickly as possible using a\nteam of mobile sensors and mobile charging stations. We develop a data-driven\nstrategy that accommodates the noise in sensed data and the limited energy\ncapacity of the sensors, and generates collision-free motion plans for the\nteam. We propose a bi-level approach, where a high-level planner leverages a\nmulti-armed bandit framework to determine the potential regions of interest for\nthe drones to visit next based on the data collected online. Then, a low-level\npath planner based on integer programming coordinates the paths for the team to\nvisit the target regions subject to the physical constraints. We characterize\nseveral theoretical properties of the proposed approach, including anytime\nguarantees and task completion time. We show the efficacy of our approach in\nsimulation, and further validate these observations in physical experiments\nusing mobile robots.\n","authors":["Xiaoshan Lin","Siddharth Nayak","Stefano Di Cairano","Abraham P. Vinod"],"pdf_url":"https://arxiv.org/pdf/2501.08222v1.pdf","comment":"8 pages, 6 figures. See https://www.youtube.com/watch?v=gzulpOcVYzg\n for an overview of the approach along with videos of the hardware experiments"},{"id":"http://arxiv.org/abs/2501.08096v1","updated":"2025-01-14T13:10:13Z","published":"2025-01-14T13:10:13Z","title":"Hybrid Action Based Reinforcement Learning for Multi-Objective\n Compatible Autonomous Driving","summary":" Reinforcement Learning (RL) has shown excellent performance in solving\ndecision-making and control problems of autonomous driving, which is\nincreasingly applied in diverse driving scenarios. However, driving is a\nmulti-attribute problem, leading to challenges in achieving multi-objective\ncompatibility for current RL methods, especially in both policy execution and\npolicy iteration. On the one hand, the common action space structure with\nsingle action type limits driving flexibility or results in large behavior\nfluctuations during policy execution. On the other hand, the multi-attribute\nweighted single reward function result in the agent's disproportionate\nattention to certain objectives during policy iterations. To this end, we\npropose a Multi-objective Ensemble-Critic reinforcement learning method with\nHybrid Parametrized Action for multi-objective compatible autonomous driving.\nSpecifically, a parameterized action space is constructed to generate hybrid\ndriving actions, combining both abstract guidance and concrete control\ncommands. A multi-objective critics architecture is constructed considering\nmultiple attribute rewards, to ensure simultaneously focusing on different\ndriving objectives. Additionally, uncertainty-based exploration strategy is\nintroduced to help the agent faster approach viable driving policy. 
The\nexperimental results in both the simulated traffic environment and the HighD\ndataset demonstrate that our method can achieve multi-objective compatible\nautonomous driving in terms of driving efficiency, action consistency, and\nsafety. It enhances the general performance of the driving while significantly\nincreasing training efficiency.\n","authors":["Guizhe Jin","Zhuoren Li","Bo Leng","Wei Han","Lu Xiong","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2501.08096v1.pdf","comment":"12 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.08077v1","updated":"2025-01-14T12:44:33Z","published":"2025-01-14T12:44:33Z","title":"HydroelasticTouch: Simulation of Tactile Sensors with Hydroelastic\n Contact Surfaces","summary":" Thanks to recent advancements in the development of inexpensive,\nhigh-resolution tactile sensors, touch sensing has become popular in\ncontact-rich robotic manipulation tasks. With the surge of data-driven methods\nand their requirement for substantial datasets, several methods of simulating\ntactile sensors have emerged in the tactile research community to overcome\nreal-world data collection limitations. These simulation approaches can be\nsplit into two main categories: fast but inaccurate (soft) point-contact models\nand slow but accurate finite element modeling. In this work, we present a novel\napproach to simulating pressure-based tactile sensors using the hydroelastic\ncontact model, which provides a high degree of physical realism at a reasonable\ncomputational cost. This model produces smooth contact forces for soft-to-soft\nand soft-to-rigid contacts along even non-convex contact surfaces. Pressure\nvalues are approximated at each point of the contact surface and can be\nintegrated to calculate sensor outputs. We validate our models' capacity to\nsynthesize real-world tactile data by conducting zero-shot sim-to-real transfer\nof a model for object state estimation. Our simulation is available as a\nplug-in to our open-source, MuJoCo-based simulator.\n","authors":["David P. Leins","Florian Patzelt","Robert Haschke"],"pdf_url":"https://arxiv.org/pdf/2501.08077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07344v3","updated":"2025-01-14T10:35:19Z","published":"2024-12-10T09:37:25Z","title":"Virtual Reflections on a Dynamic 2D Eye Model Improve Spatial Reference\n Identification","summary":" The visible orientation of human eyes creates some transparency about\npeople's spatial attention and other mental states. This leads to a dual role\nfor the eyes as a means of sensing and communication. Accordingly, artificial\neye models are being explored as communication media in human-machine\ninteraction scenarios. One challenge in the use of eye models for communication\nconsists of resolving spatial reference ambiguities, especially for\nscreen-based models. Here, we introduce an approach for overcoming this\nchallenge through the introduction of reflection-like features that are\ncontingent on artificial eye movements. We conducted a user study with 30\nparticipants in which participants had to use spatial references provided by\ndynamic eye models to advance in a fast-paced group interaction task. 
Compared\nto a non-reflective eye model and a pure reflection mode, their combination in\nthe new approach resulted in a higher identification accuracy and user\nexperience, suggesting a synergistic benefit.\n","authors":["Matti Krüger","Yutaka Oshima","Yu Fang"],"pdf_url":"https://arxiv.org/pdf/2412.07344v3.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2406.03912v2","updated":"2025-01-14T10:32:32Z","published":"2024-06-06T09:51:30Z","title":"GenSafe: A Generalizable Safety Enhancer for Safe Reinforcement Learning\n Algorithms Based on Reduced Order Markov Decision Process Model","summary":" Safe Reinforcement Learning (SRL) aims to realize a safe learning process for\nDeep Reinforcement Learning (DRL) algorithms by incorporating safety\nconstraints. However, the efficacy of SRL approaches often relies on accurate\nfunction approximations, which are notably challenging to achieve in the early\nlearning stages due to data insufficiency. To address this issue, we introduce\nin this work a novel Generalizable Safety enhancer (GenSafe) that is able to\novercome the challenge of data insufficiency and enhance the performance of SRL\napproaches. Leveraging model order reduction techniques, we first propose an\ninnovative method to construct a Reduced Order Markov Decision Process (ROMDP)\nas a low-dimensional approximator of the original safety constraints. Then, by\nsolving the reformulated ROMDP-based constraints, GenSafe refines the actions\nof the agent to increase the possibility of constraint satisfaction.\nEssentially, GenSafe acts as an additional safety layer for SRL algorithms. We\nevaluate GenSafe on multiple SRL approaches and benchmark problems. The results\ndemonstrate its capability to improve safety performance, especially in the\nearly learning phases, while maintaining satisfactory task performance. Our\nproposed GenSafe not only offers a novel measure to augment existing SRL\nmethods but also shows broad compatibility with various SRL algorithms, making\nit applicable to a wide range of systems and SRL problems.\n","authors":["Zhehua Zhou","Xuan Xie","Jiayang Song","Zhan Shu","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2406.03912v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02297v2","updated":"2025-01-14T10:27:40Z","published":"2024-08-05T08:14:28Z","title":"Perception Matters: Enhancing Embodied AI with Uncertainty-Aware\n Semantic Segmentation","summary":" Embodied AI has made significant progress acting in unexplored environments.\nHowever, tasks such as object search have largely focused on efficient policy\nlearning. In this work, we identify several gaps in current search methods:\nThey largely focus on dated perception models, neglect temporal aggregation,\nand transfer from ground truth directly to noisy perception at test time,\nwithout accounting for the resulting overconfidence in the perceived state. We\naddress the identified problems through calibrated perception probabilities and\nuncertainty across aggregation and found decisions, thereby adapting the models\nfor sequential tasks. The resulting methods can be directly integrated with\npretrained models across a wide family of existing search approaches at no\nadditional training cost. We perform extensive evaluations of aggregation\nmethods across both different semantic perception models and policies,\nconfirming the importance of calibrated uncertainties in both the aggregation\nand found decisions. 
We make the code and trained models available at\nhttps://semantic-search.cs.uni-freiburg.de.\n","authors":["Sai Prasanna","Daniel Honerkamp","Kshitij Sirohi","Tim Welschehold","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2408.02297v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07985v1","updated":"2025-01-14T10:13:41Z","published":"2025-01-14T10:13:41Z","title":"CHEQ-ing the Box: Safe Variable Impedance Learning for Robotic Polishing","summary":" Robotic systems are increasingly employed for industrial automation, with\ncontact-rich tasks like polishing requiring dexterity and compliant behaviour.\nThese tasks are difficult to model, making classical control challenging. Deep\nreinforcement learning (RL) offers a promising solution by enabling the\nlearning of models and control policies directly from data. However, its\napplication to real-world problems is limited by data inefficiency and unsafe\nexploration. Adaptive hybrid RL methods blend classical control and RL\nadaptively, combining the strengths of both: structure from control and\nlearning from RL. This has led to improvements in data efficiency and\nexploration safety. However, their potential for hardware applications remains\nunderexplored, with no evaluations on physical systems to date. Such\nevaluations are critical to fully assess the practicality and effectiveness of\nthese methods in real-world settings. This work presents an experimental\ndemonstration of the hybrid RL algorithm CHEQ for robotic polishing with\nvariable impedance, a task requiring precise force and velocity tracking. In\nsimulation, we show that variable impedance enhances polishing performance. We\ncompare standalone RL with adaptive hybrid RL, demonstrating that CHEQ achieves\neffective learning while adhering to safety constraints. On hardware, CHEQ\nachieves effective polishing behaviour, requiring only eight hours of training\nand incurring just five failures. These results highlight the potential of\nadaptive hybrid RL for real-world, contact-rich tasks trained directly on\nhardware.\n","authors":["Emma Cramer","Lukas Jäschke","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2501.07985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09622v2","updated":"2025-01-14T09:22:35Z","published":"2024-04-15T09:49:33Z","title":"DIDLM: A SLAM Dataset for Difficult Scenarios Featuring Infrared, Depth\n Cameras, LIDAR, 4D Radar, and Others under Adverse Weather, Low Light\n Conditions, and Rough Roads","summary":" Adverse weather conditions, low-light environments, and bumpy road surfaces\npose significant challenges to SLAM in robotic navigation and autonomous\ndriving. Existing datasets in this field predominantly rely on single sensors\nor combinations of LiDAR, cameras, and IMUs. However, 4D millimeter-wave radar\ndemonstrates robustness in adverse weather, infrared cameras excel in capturing\ndetails under low-light conditions, and depth images provide richer spatial\ninformation. Multi-sensor fusion methods also show potential for better\nadaptation to bumpy roads. Despite some SLAM studies incorporating these\nsensors and conditions, there remains a lack of comprehensive datasets\naddressing low-light environments and bumpy road conditions, or featuring a\nsufficiently diverse range of sensor data. In this study, we introduce a\nmulti-sensor dataset covering challenging scenarios such as snowy weather,\nrainy weather, nighttime conditions, speed bumps, and rough terrains. 
The\ndataset includes rarely utilized sensors for extreme conditions, such as 4D\nmillimeter-wave radar, infrared cameras, and depth cameras, alongside 3D LiDAR,\nRGB cameras, GPS, and IMU. It supports both autonomous driving and ground robot\napplications and provides reliable GPS/INS ground truth data, covering\nstructured and semi-structured terrains. We evaluated various SLAM algorithms\nusing this dataset, including RGB images, infrared images, depth images, LiDAR,\nand 4D millimeter-wave radar. The dataset spans a total of 18.5 km, 69 minutes,\nand approximately 660 GB, offering a valuable resource for advancing SLAM\nresearch under complex and extreme conditions. Our dataset is available at\nhttps://github.com/GongWeiSheng/DIDLM.\n","authors":["Weisheng Gong","Kaijie Su","Qingyong Li","Chen He","Tong Wu","Z. Jane Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07957v1","updated":"2025-01-14T09:21:17Z","published":"2025-01-14T09:21:17Z","title":"AI Guide Dog: Egocentric Path Prediction on Smartphone","summary":" This paper introduces AI Guide Dog (AIGD), a lightweight egocentric\nnavigation assistance system for visually impaired individuals, designed for\nreal-time deployment on smartphones. AIGD addresses key challenges in blind\nnavigation by employing a vision-only, multi-label classification approach to\npredict directional commands, ensuring safe traversal across diverse\nenvironments. We propose a novel technique to enable goal-based outdoor\nnavigation by integrating GPS signals and high-level directions, while also\naddressing uncertain multi-path predictions for destination-free indoor\nnavigation. Our generalized model is the first navigation assistance system to\nhandle both goal-oriented and exploratory navigation scenarios across indoor\nand outdoor settings, establishing a new state-of-the-art in blind navigation.\nWe present methods, datasets, evaluations, and deployment insights to encourage\nfurther innovations in assistive navigation systems.\n","authors":["Aishwarya Jadhav","Jeffery Cao","Abhishree Shetty","Urvashi Priyam Kumar","Aditi Sharma","Ben Sukboontip","Jayant Sravan Tamarapalli","Jingyi Zhang","Anirudh Koul"],"pdf_url":"https://arxiv.org/pdf/2501.07957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07317v2","updated":"2025-01-14T09:00:27Z","published":"2025-01-13T13:28:03Z","title":"Evaluation of Artificial Intelligence Methods for Lead Time Prediction\n in Non-Cycled Areas of Automotive Production","summary":" The present study examines the effectiveness of applying Artificial\nIntelligence methods in an automotive production environment to predict unknown\nlead times in a non-cycle-controlled production area. Data structures are\nanalyzed to identify contextual features and then preprocessed using one-hot\nencoding. Methods selection focuses on supervised machine learning techniques.\nIn supervised learning methods, regression and classification methods are\nevaluated. Continuous regression based on target size distribution is not\nfeasible. Classification methods analysis shows that Ensemble Learning and\nSupport Vector Machines are the most suitable. Preliminary study results\nindicate that gradient boosting algorithms LightGBM, XGBoost, and CatBoost\nyield the best results. After further testing and extensive hyperparameter\noptimization, the final method choice is the LightGBM algorithm. 
Depending on\nfeature availability and prediction interval granularity, relative prediction\naccuracies of up to 90% can be achieved. Further tests highlight the importance\nof periodic retraining of AI models to accurately represent complex production\nprocesses using the database. The research demonstrates that AI methods can be\neffectively applied to highly variable production data, adding business value\nby providing an additional metric for various control tasks while outperforming\ncurrent non AI-based systems.\n","authors":["Cornelius Hake","Jonas Weigele","Frederik Reichert","Christian Friedrich"],"pdf_url":"https://arxiv.org/pdf/2501.07317v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07255v2","updated":"2025-01-14T08:32:22Z","published":"2025-01-13T12:06:58Z","title":"GazeGrasp: DNN-Driven Robotic Grasping with Wearable Eye-Gaze Interface","summary":" We present GazeGrasp, a gaze-based manipulation system enabling individuals\nwith motor impairments to control collaborative robots using eye-gaze. The\nsystem employs an ESP32 CAM for eye tracking, MediaPipe for gaze detection, and\nYOLOv8 for object localization, integrated with a Universal Robot UR10 for\nmanipulation tasks. After user-specific calibration, the system allows\nintuitive object selection with a magnetic snapping effect and robot control\nvia eye gestures. Experimental evaluation involving 13 participants\ndemonstrated that the magnetic snapping effect significantly reduced gaze\nalignment time, improving task efficiency by 31%. GazeGrasp provides a robust,\nhands-free interface for assistive robotics, enhancing accessibility and\nautonomy for users.\n","authors":["Issatay Tokmurziyev","Miguel Altamirano Cabrera","Luis Moreno","Muhammad Haris Khan","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07255v2.pdf","comment":"Accepted to: IEEE/ACM International Conference on Human-Robot\n Interaction (HRI 2025)"},{"id":"http://arxiv.org/abs/2501.06566v2","updated":"2025-01-14T05:06:42Z","published":"2025-01-11T15:06:34Z","title":"Cooperative Aerial Robot Inspection Challenge: A Benchmark for\n Heterogeneous Multi-UAV Planning and Lessons Learned","summary":" We propose the Cooperative Aerial Robot Inspection Challenge (CARIC), a\nsimulation-based benchmark for motion planning algorithms in heterogeneous\nmulti-UAV systems. CARIC features UAV teams with complementary sensors,\nrealistic constraints, and evaluation metrics prioritizing inspection quality\nand efficiency. It offers a ready-to-use perception-control software stack and\ndiverse scenarios to support the development and evaluation of task allocation\nand motion planning algorithms. Competitions using CARIC were held at IEEE CDC\n2023 and the IROS 2024 Workshop on Multi-Robot Perception and Navigation,\nattracting innovative solutions from research teams worldwide. This paper\nexamines the top three teams from CDC 2023, analyzing their exploration,\ninspection, and task allocation strategies while drawing insights into their\nperformance across scenarios. The results highlight the task's complexity and\nsuggest promising directions for future research in cooperative multi-UAV\nsystems.\n","authors":["Muqing Cao","Thien-Minh Nguyen","Shenghai Yuan","Andreas Anastasiou","Angelos Zacharia","Savvas Papaioannou","Panayiotis Kolios","Christos G. Panayiotou","Marios M. Polycarpou","Xinhang Xu","Mingjie Zhang","Fei Gao","Boyu Zhou","Ben M. 
Chen","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2501.06566v2.pdf","comment":"Please find our website at https://ntu-aris.github.io/caric"},{"id":"http://arxiv.org/abs/2501.07832v1","updated":"2025-01-14T04:26:49Z","published":"2025-01-14T04:26:49Z","title":"Low-Contact Grasping of Soft Tissue with Complex Geometry using a Vortex\n Gripper","summary":" Soft tissue manipulation is an integral aspect of most surgical procedures;\nhowever, the vast majority of surgical graspers used today are made of hard\nmaterials, such as metals or hard plastics. Furthermore, these graspers\npredominately function by pinching tissue between two hard objects as a method\nfor tissue manipulation. As such, the potential to apply too much force during\ncontact, and thus damage tissue, is inherently high. As an alternative\napproach, gaspers developed using a pneumatic vortex could potentially levitate\nsoft tissue, enabling manipulation with low or even no contact force. In this\npaper, we present the design and well as a full factorial study of the force\ncharacteristics of the vortex gripper grasping soft surfaces with four common\nshapes, with convex and concave curvature, and ranging over 10 different radii\nof curvature, for a total of 40 unique surfaces. By changing the parameters of\nthe nozzle elements in the design of the gripper, it was possible to\ninvestigate the influence of the mass flow parameters of the vortex gripper on\nthe lifting force for all of these different soft surfaces. An $\\pmb{ex}$\n$\\pmb{vivo}$ experiment was conducted on grasping biological tissues and soft\nballs of various shapes to show the advantages and disadvantages of the\nproposed technology. The obtained results allowed us to find limitations in the\nuse of vortex technology and the following stages of its improvement for\nmedical use.\n","authors":["Roman Mykhailyshyn","Ann Majewicz Fey"],"pdf_url":"https://arxiv.org/pdf/2501.07832v1.pdf","comment":"Submitted to T-MRB"},{"id":"http://arxiv.org/abs/2211.15975v4","updated":"2025-01-14T03:55:17Z","published":"2022-11-29T07:18:32Z","title":"Analyzing Infrastructure LiDAR Placement with Realistic LiDAR Simulation\n Library","summary":" Recently, Vehicle-to-Everything(V2X) cooperative perception has attracted\nincreasing attention. Infrastructure sensors play a critical role in this\nresearch field; however, how to find the optimal placement of infrastructure\nsensors is rarely studied. In this paper, we investigate the problem of\ninfrastructure sensor placement and propose a pipeline that can efficiently and\neffectively find optimal installation positions for infrastructure sensors in a\nrealistic simulated environment. To better simulate and evaluate LiDAR\nplacement, we establish a Realistic LiDAR Simulation library that can simulate\nthe unique characteristics of different popular LiDARs and produce\nhigh-fidelity LiDAR point clouds in the CARLA simulator. Through simulating\npoint cloud data in different LiDAR placements, we can evaluate the perception\naccuracy of these placements using multiple detection models. Then, we analyze\nthe correlation between the point cloud distribution and perception accuracy by\ncalculating the density and uniformity of regions of interest. Experiments show\nthat when using the same number and type of LiDAR, the placement scheme\noptimized by our proposed method improves the average precision by 15%,\ncompared with the conventional placement scheme in the standard lane scene. 
We\nalso analyze the correlation between perception performance in the region of\ninterest and LiDAR point cloud distribution and validate that density and\nuniformity can be indicators of performance. Both the RLS Library and related\ncode will be released at https://github.com/PJLab-ADG/PCSim.\n","authors":["Xinyu Cai","Wentao Jiang","Runsheng Xu","Wenquan Zhao","Jiaqi Ma","Si Liu","Yikang Li"],"pdf_url":"https://arxiv.org/pdf/2211.15975v4.pdf","comment":"7 pages, 6 figures, accepted to the IEEE International Conference on\n Robotics and Automation (ICRA'23)"},{"id":"http://arxiv.org/abs/2501.06783v2","updated":"2025-01-14T03:16:01Z","published":"2025-01-12T11:42:28Z","title":"Cost-Effective Robotic Handwriting System with AI Integration","summary":" This paper introduces a cost-effective robotic handwriting system designed to\nreplicate human-like handwriting with high precision. Combining a Raspberry Pi\nPico microcontroller, 3D-printed components, and a machine learning-based\nhandwriting generation model implemented via TensorFlow, the system converts\nuser-supplied text into realistic stroke trajectories. By leveraging\nlightweight 3D-printed materials and efficient mechanical designs, the system\nachieves a total hardware cost of approximately \\$56, significantly\nundercutting commercial alternatives. Experimental evaluations demonstrate\nhandwriting precision within $\\pm$0.3 millimeters and a writing speed of\napproximately 200 mm/min, positioning the system as a viable solution for\neducational, research, and assistive applications. This study seeks to lower\nthe barriers to personalized handwriting technologies, making them accessible\nto a broader audience.\n","authors":["Tianyi Huang","Richard Xiong"],"pdf_url":"https://arxiv.org/pdf/2501.06783v2.pdf","comment":"This is an updated version of a paper originally presented at the\n 2024 IEEE Long Island Systems, Applications and Technology Conference (LISAT)"},{"id":"http://arxiv.org/abs/2305.17217v5","updated":"2025-01-14T01:13:02Z","published":"2023-05-26T19:22:31Z","title":"Tactile-based Exploration, Mapping and Navigation with\n Collision-Resilient Aerial Vehicles","summary":" This article introduces XPLORER, a passive deformable UAV with a\nspring-augmented chassis and proprioceptive state awareness, designed to endure\ncollisions and maintain smooth contact. We develop a fast-converging external\nforce estimation algorithm for XPLORER that leverages onboard sensors and\nproprioceptive data for contact and collision detection. Using this force\ninformation, we propose four motion primitives, including three novel\ntactile-based primitives: tactile-traversal, tactile-turning, and\nricocheting-to aid XPLORER in navigating unknown environments. These primitives\nare synthesized autonomously in real-time to enable efficient exploration and\nnavigation by leveraging collisions and contacts. 
Experimental results\ndemonstrate the effectiveness of our approach, highlighting the potential of\npassive deformable UAVs for contact-rich real-world tasks such as\nnon-destructive inspection, surveillance and mapping, and pursuit/evasion.\n","authors":["Karishma Patnaik","Aravind Adhith Pandian Saravanakumaran","Wenlong Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.17217v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12176v3","updated":"2025-01-14T22:44:14Z","published":"2024-03-18T18:49:20Z","title":"Safety Implications of Explainable Artificial Intelligence in End-to-End\n Autonomous Driving","summary":" The end-to-end learning pipeline is gradually creating a paradigm shift in\nthe ongoing development of highly autonomous vehicles, largely due to advances\nin deep learning, the availability of large-scale training datasets, and\nimprovements in integrated sensor devices. However, a lack of explainability in\nreal-time decisions with contemporary learning methods impedes user trust and\nattenuates the widespread deployment and commercialization of such vehicles.\nMoreover, the issue is exacerbated when these cars are involved in or cause\ntraffic accidents. Consequently, explainability in end-to-end autonomous\ndriving is essential to build trust in vehicular automation. With that said,\nautomotive researchers have not yet rigorously explored safety benefits and\nconsequences of explanations in end-to-end autonomous driving. This paper aims\nto bridge the gaps between these topics and seeks to answer the following\nresearch question: What are safety implications of explanations in end-to-end\nautonomous driving? In this regard, we first revisit established safety and\nexplainability concepts in end-to-end driving. Furthermore, we present three\ncritical case studies and show the pivotal role of explanations in enhancing\nself-driving safety. Finally, we describe insights from empirical studies and\nreveal potential value, limitations, and caveats of practical explainable AI\nmethods with respect to their safety assurance in end-to-end driving.\n","authors":["Shahin Atakishiyev","Mohammad Salameh","Randy Goebel"],"pdf_url":"https://arxiv.org/pdf/2403.12176v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06372v2","updated":"2025-01-14T22:43:44Z","published":"2024-10-08T21:14:09Z","title":"Cooperative and Asynchronous Transformer-based Mission Planning for\n Heterogeneous Teams of Mobile Robots","summary":" Cooperative mission planning for heterogeneous teams of mobile robots\npresents a unique set of challenges, particularly when operating under\ncommunication constraints and limited computational resources. To address these\nchallenges, we propose the Cooperative and Asynchronous Transformer-based\nMission Planning (CATMiP) framework, which leverages multi-agent reinforcement\nlearning (MARL) to coordinate distributed decision making among agents with\ndiverse sensing, motion, and actuation capabilities, operating under sporadic\nad hoc communication. A Class-based Macro-Action Decentralized Partially\nObservable Markov Decision Process (CMacDec-POMDP) is also formulated to\neffectively model asynchronous decision-making for heterogeneous teams of\nagents. The framework utilizes an asynchronous centralized training and\ndistributed execution scheme that is developed based on the Multi-Agent\nTransformer (MAT) architecture. This design allows a single trained model to\ngeneralize to larger environments and accommodate varying team sizes and\ncompositions. 
We evaluate CATMiP in a 2D grid-world simulation environment and\ncompare its performance against planning-based exploration methods. Results\ndemonstrate CATMiP's superior efficiency, scalability, and robustness to\ncommunication dropouts, highlighting its potential for real-world heterogeneous\nmobile robot systems. The code is available at\nhttps://github.com/mylad13/CATMiP.\n","authors":["Milad Farjadnasab","Shahin Sirouspour"],"pdf_url":"https://arxiv.org/pdf/2410.06372v2.pdf","comment":"27 pages, 8 figures, this work has been submitted to Elsevier for\n possible publication"},{"id":"http://arxiv.org/abs/2501.08469v1","updated":"2025-01-14T22:30:38Z","published":"2025-01-14T22:30:38Z","title":"Electrostatic Clutches Enable High-Force Mechanical Multiplexing:\n Demonstrating Single-Motor Full-Actuation of a 4-DoF Hand","summary":" This paper introduces a novel mechanical multiplexing system powered by\nelectrostatic capstan clutches, enabling high-force, single-motor control of\nmultiple degrees of freedom (DoF). The system is capable of both bidirectional\nsingle-input single-output time-division and single-input multiple-output\nmultiplexing to actuate a commercial 4-DoF robotic hand with a single motor.\nOur mechanical multiplexer is also capable of powerless position holding owing\nto its use of a leadscrew nut acting as the output. Experimental results\ndemonstrate the effectiveness of this approach, achieving individual and\nsimultaneous actuation. This innovation offers a scalable solution for high-DoF\nrobotic systems, providing a path to efficient actuation in robotic platforms.\n","authors":["Timothy E. Amish","Jeffrey T. Auletta","Chad C. Kessens","Joshua R. Smith","Jeffrey I. Lipton"],"pdf_url":"https://arxiv.org/pdf/2501.08469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12713v2","updated":"2025-01-14T22:29:03Z","published":"2024-09-18T05:28:12Z","title":"A Signal Temporal Logic Approach for Task-Based Coordination of\n Multi-Aerial Systems: a Wind Turbine Inspection Case Study","summary":" The paper addresses task assignment and trajectory generation for\ncollaborative inspection missions using a fleet of multi-rotors, focusing on\nthe wind turbine inspection scenario. The proposed solution enables safe and\nfeasible trajectories while accommodating heterogeneous time-bound constraints\nand vehicle physical limits. An optimization problem is formulated to meet\nmission objectives and temporal requirements encoded as Signal Temporal Logic\n(STL) specifications. Additionally, an event-triggered replanner is introduced\nto address unforeseen events and compensate for lost time. Furthermore, a\ngeneralized robustness scoring method is employed to reflect user preferences\nand mitigate task conflicts. The effectiveness of the proposed approach is\ndemonstrated through MATLAB and Gazebo simulations, as well as field\nmulti-robot experiments in a mock-up scenario.\n","authors":["Giuseppe Silano","Alvaro Caballero","Davide Liuzza","Luigi Iannelli","Stjepan Bogdan","Martin Saska"],"pdf_url":"https://arxiv.org/pdf/2409.12713v2.pdf","comment":"\\c{opyright}2025 Elsevier. This work has been accepted to \"Robotics\n and Autonomous Systems\" for possible publication. Personal use of this\n material is permitted. 
Permission from Elsevier must be obtained for all\n other uses"},{"id":"http://arxiv.org/abs/2501.04693v3","updated":"2025-01-14T22:28:39Z","published":"2025-01-08T18:57:33Z","title":"Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous\n Sensors via Language Grounding","summary":" Interacting with the world is a multi-sensory experience: achieving effective\ngeneral-purpose interaction requires making use of all available modalities --\nincluding vision, touch, and audio -- to fill in gaps from partial observation.\nFor example, when vision is occluded reaching into a bag, a robot should rely\non its senses of touch and sound. However, state-of-the-art generalist robot\npolicies are typically trained on large datasets to predict robot actions\nsolely from visual and proprioceptive observations. In this work, we propose\nFuSe, a novel approach that enables finetuning visuomotor generalist policies\non heterogeneous sensor modalities for which large datasets are not readily\navailable by leveraging natural language as a common cross-modal grounding. We\ncombine a multimodal contrastive loss with a sensory-grounded language\ngeneration loss to encode high-level semantics. In the context of robot\nmanipulation, we show that FuSe enables performing challenging tasks that\nrequire reasoning jointly over modalities such as vision, touch, and sound in a\nzero-shot setting, such as multimodal prompting, compositional cross-modal\nprompting, and descriptions of objects it interacts with. We show that the same\nrecipe is applicable to widely different generalist policies, including both\ndiffusion-based generalist policies and large vision-language-action (VLA)\nmodels. Extensive experiments in the real world show that FuSe is able to\nincrease success rates by over 20% compared to all considered baselines.\n","authors":["Joshua Jones","Oier Mees","Carmelo Sferrazza","Kyle Stachowicz","Pieter Abbeel","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2501.04693v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02459v3","updated":"2025-01-14T22:07:08Z","published":"2022-12-05T18:02:46Z","title":"Resilient Distributed Optimization for Multi-Agent Cyberphysical Systems","summary":" This work focuses on the problem of distributed optimization in multi-agent\ncyberphysical systems, where a legitimate agent's iterates are influenced both\nby the values it receives from potentially malicious neighboring agents, and by\nits own self-serving target function. We develop a new algorithmic and\nanalytical framework to achieve resilience for the class of problems where\nstochastic values of trust between agents exist and can be exploited. In this\ncase, we show that convergence to the true global optimal point can be\nrecovered, both in mean and almost surely, even in the presence of malicious\nagents. Furthermore, we provide expected convergence rate guarantees in the\nform of upper bounds on the expected squared distance to the optimal value.\nFinally, numerical results are presented that validate our analytical\nconvergence guarantees even when the malicious agents compose the majority of\nagents in the network and where existing methods fail to converge to the\noptimal nominal points.\n","authors":["Michal Yemini","Angelia Nedić","Andrea J. 
Goldsmith","Stephanie Gil"],"pdf_url":"https://arxiv.org/pdf/2212.02459v3.pdf","comment":"Accepted for publication in the IEEE Transactions on Automatic\n Control"},{"id":"http://arxiv.org/abs/2403.16689v3","updated":"2025-01-14T21:37:31Z","published":"2024-03-25T12:23:39Z","title":"SYNAPSE: SYmbolic Neural-Aided Preference Synthesis Engine","summary":" This paper addresses the problem of preference learning, which aims to align\nrobot behaviors through learning user specific preferences (e.g. \"good\npull-over location\") from visual demonstrations. Despite its similarity to\nlearning factual concepts (e.g. \"red door\"), preference learning is a\nfundamentally harder problem due to its subjective nature and the paucity of\nperson-specific training data. We address this problem using a novel framework\ncalled SYNAPSE, which is a neuro-symbolic approach designed to efficiently\nlearn preferential concepts from limited data. SYNAPSE represents preferences\nas neuro-symbolic programs, facilitating inspection of individual parts for\nalignment, in a domain-specific language (DSL) that operates over images and\nleverages a novel combination of visual parsing, large language models, and\nprogram synthesis to learn programs representing individual preferences. We\nperform extensive evaluations on various preferential concepts as well as user\ncase studies demonstrating its ability to align well with dissimilar user\npreferences. Our method significantly outperforms baselines, especially when it\ncomes to out of distribution generalization. We show the importance of the\ndesign choices in the framework through multiple ablation studies. Code,\nadditional results, and supplementary material can be found on the website:\nhttps://amrl.cs.utexas.edu/synapse\n","authors":["Sadanand Modak","Noah Patton","Isil Dillig","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2403.16689v3.pdf","comment":"Accepted (oral) at AAAI 25"},{"id":"http://arxiv.org/abs/2501.07295v2","updated":"2025-01-14T19:39:23Z","published":"2025-01-13T13:01:21Z","title":"GestLLM: Advanced Hand Gesture Interpretation via Large Language Models\n for Human-Robot Interaction","summary":" This paper introduces GestLLM, an advanced system for human-robot interaction\nthat enables intuitive robot control through hand gestures. Unlike conventional\nsystems, which rely on a limited set of predefined gestures, GestLLM leverages\nlarge language models and feature extraction via MediaPipe to interpret a\ndiverse range of gestures. This integration addresses key limitations in\nexisting systems, such as restricted gesture flexibility and the inability to\nrecognize complex or unconventional gestures commonly used in human\ncommunication.\n By combining state-of-the-art feature extraction and language model\ncapabilities, GestLLM achieves performance comparable to leading\nvision-language models while supporting gestures underrepresented in\ntraditional datasets. For example, this includes gestures from popular culture,\nsuch as the ``Vulcan salute\" from Star Trek, without any additional\npretraining, prompt engineering, etc. This flexibility enhances the naturalness\nand inclusivity of robot control, making interactions more intuitive and\nuser-friendly.\n GestLLM provides a significant step forward in gesture-based interaction,\nenabling robots to understand and respond to a wide variety of hand gestures\neffectively. 
This paper outlines its design, implementation, and evaluation,\ndemonstrating its potential applications in advanced human-robot collaboration,\nassistive robotics, and interactive entertainment.\n","authors":["Oleg Kobzarev","Artem Lykov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2501.07295v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08389v1","updated":"2025-01-14T19:06:44Z","published":"2025-01-14T19:06:44Z","title":"Toward Zero-Shot User Intent Recognition in Shared Autonomy","summary":" A fundamental challenge of shared autonomy is to use high-DoF robots to\nassist, rather than hinder, humans by first inferring user intent and then\nempowering the user to achieve their intent. Although successful, prior methods\neither rely heavily on a priori knowledge of all possible human intents or\nrequire many demonstrations and interactions with the human to learn these\nintents before being able to assist the user. We propose and study a zero-shot,\nvision-only shared autonomy (VOSA) framework designed to allow robots to use\nend-effector vision to estimate zero-shot human intents in conjunction with\nblended control to help humans accomplish manipulation tasks with unknown and\ndynamically changing object locations. To demonstrate the effectiveness of our\nVOSA framework, we instantiate a simple version of VOSA on a Kinova Gen3\nmanipulator and evaluate our system by conducting a user study on three\ntabletop manipulation tasks. The performance of VOSA matches that of an oracle\nbaseline model that receives privileged knowledge of possible human intents\nwhile also requiring significantly less effort than unassisted teleoperation.\nIn more realistic settings, where the set of possible human intents is fully or\npartially unknown, we demonstrate that VOSA requires less human effort and time\nthan baseline approaches while being preferred by a majority of the\nparticipants. Our results demonstrate the efficacy and efficiency of using\noff-the-shelf vision algorithms to enable flexible and beneficial shared\ncontrol of a robot manipulator. Code and videos available here:\nhttps://sites.google.com/view/zeroshot-sharedautonomy/home.\n","authors":["Atharv Belsare","Zohre Karimi","Connor Mattson","Daniel S. Brown"],"pdf_url":"https://arxiv.org/pdf/2501.08389v1.pdf","comment":"10 pages, 6 figures, Accepted to IEEE/ACM International Conference on\n Human-Robot Interaction (HRI), 2025. Equal Contribution from the first three\n authors"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.08333v1","updated":"2025-01-14T18:59:59Z","published":"2025-01-14T18:59:59Z","title":"DAViD: Modeling Dynamic Affordance of 3D Objects using Pre-trained Video\n Diffusion Models","summary":" Understanding the ability of humans to use objects is crucial for AI to\nimprove daily life. Existing studies for learning such ability focus on\nhuman-object patterns (e.g., contact, spatial relation, orientation) in static\nsituations, and learning Human-Object Interaction (HOI) patterns over time\n(i.e., movement of human and object) is relatively less explored. In this\npaper, we introduce a novel type of affordance named Dynamic Affordance. For a\ngiven input 3D object mesh, we learn dynamic affordance which models the\ndistribution of both (1) human motion and (2) human-guided object pose during\ninteractions. As a core idea, we present a method to learn the 3D dynamic\naffordance from synthetically generated 2D videos, leveraging a pre-trained\nvideo diffusion model. 
Specifically, we propose a pipeline that first generates\n2D HOI videos from the 3D object and then lifts them into 3D to generate 4D HOI\nsamples. Once we generate diverse 4D HOI samples on various target objects, we\ntrain our DAViD, where we present a method based on the Low-Rank Adaptation\n(LoRA) module for pre-trained human motion diffusion model (MDM) and an object\npose diffusion model with human pose guidance. Our motion diffusion model is\nextended for multi-object interactions, demonstrating the advantage of our\npipeline with LoRA for combining the concepts of object usage. Through\nextensive experiments, we demonstrate our DAViD outperforms the baselines in\ngenerating human motion with HOIs.\n","authors":["Hyeonwoo Kim","Sangwon Beak","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2501.08333v1.pdf","comment":"Project Page: https://snuvclab.github.io/david/"},{"id":"http://arxiv.org/abs/2501.08332v1","updated":"2025-01-14T18:59:55Z","published":"2025-01-14T18:59:55Z","title":"MangaNinja: Line Art Colorization with Precise Reference Following","summary":" Derived from diffusion models, MangaNinjia specializes in the task of\nreference-guided line art colorization. We incorporate two thoughtful designs\nto ensure precise character detail transcription, including a patch shuffling\nmodule to facilitate correspondence learning between the reference color image\nand the target line art, and a point-driven control scheme to enable\nfine-grained color matching. Experiments on a self-collected benchmark\ndemonstrate the superiority of our model over current solutions in terms of\nprecise colorization. We further showcase the potential of the proposed\ninteractive point control in handling challenging cases, cross-character\ncolorization, multi-reference harmonization, beyond the reach of existing\nalgorithms.\n","authors":["Zhiheng Liu","Ka Leong Cheng","Xi Chen","Jie Xiao","Hao Ouyang","Kai Zhu","Yu Liu","Yujun Shen","Qifeng Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2501.08332v1.pdf","comment":"Project page and code: https://johanan528.github.io/MangaNinjia/"},{"id":"http://arxiv.org/abs/2501.08331v1","updated":"2025-01-14T18:59:10Z","published":"2025-01-14T18:59:10Z","title":"Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using\n Real-Time Warped Noise","summary":" Generative modeling aims to transform random noise into structured outputs.\nIn this work, we enhance video diffusion models by allowing motion control via\nstructured latent noise sampling. This is achieved by just a change in data: we\npre-process training videos to yield structured noise. Consequently, our method\nis agnostic to diffusion model design, requiring no changes to model\narchitectures or training pipelines. Specifically, we propose a novel noise\nwarping algorithm, fast enough to run in real time, that replaces random\ntemporal Gaussianity with correlated warped noise derived from optical flow\nfields, while preserving the spatial Gaussianity. The efficiency of our\nalgorithm enables us to fine-tune modern video diffusion base models using\nwarped noise with minimal overhead, and provide a one-stop solution for a wide\nrange of user-friendly motion control: local object motion control, global\ncamera movement control, and motion transfer. The harmonization between\ntemporal coherence and spatial Gaussianity in our warped noise leads to\neffective motion control while maintaining per-frame pixel quality. 
Extensive\nexperiments and user studies demonstrate the advantages of our method, making\nit a robust and scalable approach for controlling motion in video diffusion\nmodels. Video results are available on our webpage:\nhttps://vgenai-netflix-eyeline-research.github.io/Go-with-the-Flow/; source\ncode and model checkpoints are available on GitHub:\nhttps://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow.\n","authors":["Ryan Burgert","Yuancheng Xu","Wenqi Xian","Oliver Pilarski","Pascal Clausen","Mingming He","Li Ma","Yitong Deng","Lingxiao Li","Mohsen Mousavi","Michael Ryoo","Paul Debevec","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2501.08331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08329v1","updated":"2025-01-14T18:59:05Z","published":"2025-01-14T18:59:05Z","title":"Predicting 4D Hand Trajectory from Monocular Videos","summary":" We present HaPTIC, an approach that infers coherent 4D hand trajectories from\nmonocular videos. Current video-based hand pose reconstruction methods\nprimarily focus on improving frame-wise 3D pose using adjacent frames rather\nthan studying consistent 4D hand trajectories in space. Despite the additional\ntemporal cues, they generally underperform compared to image-based methods due\nto the scarcity of annotated video data. To address these issues, we repurpose\na state-of-the-art image-based transformer to take in multiple frames and\ndirectly predict a coherent trajectory. We introduce two types of lightweight\nattention layers: cross-view self-attention to fuse temporal information, and\nglobal cross-attention to bring in larger spatial context. Our method infers 4D\nhand trajectories similar to the ground truth while maintaining strong 2D\nreprojection alignment. We apply the method to both egocentric and allocentric\nvideos. It significantly outperforms existing methods in global trajectory\naccuracy while being comparable to the state-of-the-art in single-image pose\nestimation. Project website: https://judyye.github.io/haptic-www\n","authors":["Yufei Ye","Yao Feng","Omid Taheri","Haiwen Feng","Shubham Tulsiani","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2501.08329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08326v1","updated":"2025-01-14T18:58:04Z","published":"2025-01-14T18:58:04Z","title":"Omni-RGPT: Unifying Image and Video Region-level Understanding via Token\n Marks","summary":" We present Omni-RGPT, a multimodal large language model designed to\nfacilitate region-level comprehension for both images and videos. To achieve\nconsistent region representation across spatio-temporal dimensions, we\nintroduce Token Mark, a set of tokens highlighting the target regions within\nthe visual feature space. These tokens are directly embedded into spatial\nregions using region prompts (e.g., boxes or masks) and simultaneously\nincorporated into the text prompt to specify the target, establishing a direct\nconnection between visual and text tokens. To further support robust video\nunderstanding without requiring tracklets, we introduce an auxiliary task that\nguides Token Mark by leveraging the consistency of the tokens, enabling stable\nregion interpretation across the video. Additionally, we introduce a\nlarge-scale region-level video instruction dataset (RegVID-300k). 
Omni-RGPT\nachieves state-of-the-art results on image and video-based commonsense\nreasoning benchmarks while showing strong performance in captioning and\nreferring expression comprehension tasks.\n","authors":["Miran Heo","Min-Hung Chen","De-An Huang","Sifei Liu","Subhashree Radhakrishnan","Seon Joo Kim","Yu-Chiang Frank Wang","Ryo Hachiuma"],"pdf_url":"https://arxiv.org/pdf/2501.08326v1.pdf","comment":"Project page: https://miranheo.github.io/omni-rgpt/"},{"id":"http://arxiv.org/abs/2501.08325v1","updated":"2025-01-14T18:57:21Z","published":"2025-01-14T18:57:21Z","title":"GameFactory: Creating New Games with Generative Interactive Videos","summary":" Generative game engines have the potential to revolutionize game development\nby autonomously creating new content and reducing manual workload. However,\nexisting video-based game generation methods fail to address the critical\nchallenge of scene generalization, limiting their applicability to existing\ngames with fixed styles and scenes. In this paper, we present GameFactory, a\nframework focused on exploring scene generalization in game video generation.\nTo enable the creation of entirely new and diverse games, we leverage\npre-trained video diffusion models trained on open-domain video data. To bridge\nthe domain gap between open-domain priors and small-scale game datasets, we\npropose a multi-phase training strategy that decouples game style learning from\naction control, preserving open-domain generalization while achieving action\ncontrollability. Using Minecraft as our data source, we release GF-Minecraft, a\nhigh-quality, diverse, action-annotated video dataset for research.\nFurthermore, we extend our framework to enable autoregressive\naction-controllable game video generation, allowing the production of\nunlimited-length interactive game videos. Experimental results demonstrate that\nGameFactory effectively generates open-domain, diverse, and action-controllable\ngame videos, representing a significant step forward in AI-driven game\ngeneration. Our dataset and project page are publicly available at\n\url{https://vvictoryuki.github.io/gamefactory/}.\n","authors":["Jiwen Yu","Yiran Qin","Xintao Wang","Pengfei Wan","Di Zhang","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08316v1","updated":"2025-01-14T18:51:48Z","published":"2025-01-14T18:51:48Z","title":"Diffusion Adversarial Post-Training for One-Step Video Generation","summary":" Diffusion models are widely used for image and video generation, but\ntheir iterative generation process is slow and expensive. While existing\ndistillation approaches have demonstrated the potential for one-step generation\nin the image domain, they still suffer from significant quality degradation. In\nthis work, we propose Adversarial Post-Training (APT) against real data\nfollowing diffusion pre-training for one-step video generation. To improve the\ntraining stability and quality, we introduce several improvements to the model\narchitecture and training procedures, along with an approximated R1\nregularization objective. Empirically, our experiments show that our\nadversarial post-trained model, Seaweed-APT, can generate 2-second, 1280x720,\n24fps videos in real time using a single forward evaluation step. 
Additionally,\nour model is capable of generating 1024px images in a single step, achieving\nquality comparable to state-of-the-art methods.\n","authors":["Shanchuan Lin","Xin Xia","Yuxi Ren","Ceyuan Yang","Xuefeng Xiao","Lu Jiang"],"pdf_url":"https://arxiv.org/pdf/2501.08316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07169v3","updated":"2025-01-14T18:51:43Z","published":"2024-12-10T04:03:46Z","title":"Rate-In: Information-Driven Adaptive Dropout Rates for Improved\n Inference-Time Uncertainty Estimation","summary":" Accurate uncertainty estimation is crucial for deploying neural networks in\nrisk-sensitive applications such as medical diagnosis. Monte Carlo Dropout is a\nwidely used technique for approximating predictive uncertainty by performing\nstochastic forward passes with dropout during inference. However, using static\ndropout rates across all layers and inputs can lead to suboptimal uncertainty\nestimates, as it fails to adapt to the varying characteristics of individual\ninputs and network layers. Existing approaches optimize dropout rates during\ntraining using labeled data, resulting in fixed inference-time parameters that\ncannot adjust to new data distributions, compromising uncertainty estimates in\nMonte Carlo simulations.\n In this paper, we propose Rate-In, an algorithm that dynamically adjusts\ndropout rates during inference by quantifying the information loss induced by\ndropout in each layer's feature maps. By treating dropout as controlled noise\ninjection and leveraging information-theoretic principles, Rate-In adapts\ndropout rates per layer and per input instance without requiring ground truth\nlabels. By quantifying the functional information loss in feature maps, we\nadaptively tune dropout rates to maintain perceptual quality across diverse\nmedical imaging tasks and architectural configurations. Our extensive empirical\nstudy on synthetic data and real-world medical imaging tasks demonstrates that\nRate-In improves calibration and sharpens uncertainty estimates compared to\nfixed or heuristic dropout rates without compromising predictive performance.\nRate-In offers a practical, unsupervised, inference-time approach to optimizing\ndropout for more reliable predictive uncertainty estimation in critical\napplications.\n","authors":["Tal Zeevi","Ravid Shwartz-Ziv","Yann LeCun","Lawrence H. Staib","John A. Onofrey"],"pdf_url":"https://arxiv.org/pdf/2412.07169v3.pdf","comment":"Updated author affiliation"},{"id":"http://arxiv.org/abs/2501.08313v1","updated":"2025-01-14T18:50:05Z","published":"2025-01-14T18:50:05Z","title":"MiniMax-01: Scaling Foundation Models with Lightning Attention","summary":" We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01,\nwhich are comparable to top-tier models while offering superior capabilities in\nprocessing longer contexts. The core lies in lightning attention and its\nefficient scaling. To maximize computational capacity, we integrate it with\nMixture of Experts (MoE), creating a model with 32 experts and 456 billion\ntotal parameters, of which 45.9 billion are activated for each token. We\ndevelop an optimized parallel strategy and highly efficient\ncomputation-communication overlap techniques for MoE and lightning attention.\nThis approach enables us to conduct efficient training and inference on models\nwith hundreds of billions of parameters across contexts spanning millions of\ntokens. 
The context window of MiniMax-Text-01 can reach up to 1 million tokens\nduring training and extrapolate to 4 million tokens during inference at an\naffordable cost. Our vision-language model, MiniMax-VL-01 is built through\ncontinued training with 512 billion vision-language tokens. Experiments on both\nstandard and in-house benchmarks show that our models match the performance of\nstate-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32\ntimes longer context window. We publicly release MiniMax-01 at\nhttps://github.com/MiniMax-AI.\n","authors":[" MiniMax","Aonian Li","Bangwei Gong","Bo Yang","Boji Shan","Chang Liu","Cheng Zhu","Chunhao Zhang","Congchao Guo","Da Chen","Dong Li","Enwei Jiao","Gengxin Li","Guojun Zhang","Haohai Sun","Houze Dong","Jiadai Zhu","Jiaqi Zhuang","Jiayuan Song","Jin Zhu","Jingtao Han","Jingyang Li","Junbin Xie","Junhao Xu","Junjie Yan","Kaishun Zhang","Kecheng Xiao","Kexi Kang","Le Han","Leyang Wang","Lianfei Yu","Liheng Feng","Lin Zheng","Linbo Chai","Long Xing","Meizhi Ju","Mingyuan Chi","Mozhi Zhang","Peikai Huang","Pengcheng Niu","Pengfei Li","Pengyu Zhao","Qi Yang","Qidi Xu","Qiexiang Wang","Qin Wang","Qiuhui Li","Ruitao Leng","Shengmin Shi","Shuqi Yu","Sichen Li","Songquan Zhu","Tao Huang","Tianrun Liang","Weigao Sun","Weixuan Sun","Weiyu Cheng","Wenkai Li","Xiangjun Song","Xiao Su","Xiaodong Han","Xinjie Zhang","Xinzhu Hou","Xu Min","Xun Zou","Xuyang Shen","Yan Gong","Yingjie Zhu","Yipeng Zhou","Yiran Zhong","Yongyi Hu","Yuanxiang Fan","Yue Yu","Yufeng Yang","Yuhao Li","Yunan Huang","Yunji Li","Yunpeng Huang","Yunzhi Xu","Yuxin Mao","Zehan Li","Zekang Li","Zewei Tao","Zewen Ying","Zhaoyang Cong","Zhen Qin","Zhenhua Fan","Zhihang Yu","Zhuo Jiang","Zijia Wu"],"pdf_url":"https://arxiv.org/pdf/2501.08313v1.pdf","comment":"A technical report from MiniMax. The authors are listed in\n alphabetical order. We open-sourced our MiniMax-01 at\n https://github.com/MiniMax-AI"},{"id":"http://arxiv.org/abs/2501.08303v1","updated":"2025-01-14T18:34:14Z","published":"2025-01-14T18:34:14Z","title":"Advancing Semantic Future Prediction through Multimodal Visual Sequence\n Transformers","summary":" Semantic future prediction is important for autonomous systems navigating\ndynamic environments. This paper introduces FUTURIST, a method for multimodal\nfuture semantic prediction that uses a unified and efficient visual sequence\ntransformer architecture. Our approach incorporates a multimodal masked visual\nmodeling objective and a novel masking mechanism designed for multimodal\ntraining. This allows the model to effectively integrate visible information\nfrom various modalities, improving prediction accuracy. Additionally, we\npropose a VAE-free hierarchical tokenization process, which reduces\ncomputational complexity, streamlines the training pipeline, and enables\nend-to-end training with high-resolution, multimodal inputs. We validate\nFUTURIST on the Cityscapes dataset, demonstrating state-of-the-art performance\nin future semantic segmentation for both short- and mid-term forecasting. 
We\nprovide the implementation code at https://github.com/Sta8is/FUTURIST .\n","authors":["Efstathios Karypidis","Ioannis Kakogeorgiou","Spyros Gidaris","Nikos Komodakis"],"pdf_url":"https://arxiv.org/pdf/2501.08303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08295v1","updated":"2025-01-14T18:22:21Z","published":"2025-01-14T18:22:21Z","title":"LayerAnimate: Layer-specific Control for Animation","summary":" Animated video separates foreground and background elements into layers, with\ndistinct processes for sketching, refining, coloring, and in-betweening.\nExisting video generation methods typically treat animation as a monolithic\ndata domain, lacking fine-grained control over individual layers. In this\npaper, we introduce LayerAnimate, a novel architectural approach that enhances\nfine-grained control over individual animation layers within a video diffusion\nmodel, allowing users to independently manipulate foreground and background\nelements in distinct layers. To address the challenge of limited layer-specific\ndata, we propose a data curation pipeline that features automated element\nsegmentation, motion-state hierarchical merging, and motion coherence\nrefinement. Through quantitative and qualitative comparisons, and user study,\nwe demonstrate that LayerAnimate outperforms current methods in terms of\nanimation quality, control precision, and usability, making it an ideal tool\nfor both professional animators and amateur enthusiasts. This framework opens\nup new possibilities for layer-specific animation applications and creative\nflexibility. Our code is available at https://layeranimate.github.io.\n","authors":["Yuxue Yang","Lue Fan","Zuzen Lin","Feng Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.08295v1.pdf","comment":"Project page: https://layeranimate.github.io"},{"id":"http://arxiv.org/abs/2407.04545v2","updated":"2025-01-14T18:20:45Z","published":"2024-07-05T14:30:24Z","title":"Gaussian Eigen Models for Human Heads","summary":" Current personalized neural head avatars face a trade-off: lightweight models\nlack detail and realism, while high-quality, animatable avatars require\nsignificant computational resources, making them unsuitable for commodity\ndevices. To address this gap, we introduce Gaussian Eigen Models (GEM), which\nprovide high-quality, lightweight, and easily controllable head avatars. GEM\nutilizes 3D Gaussian primitives for representing the appearance combined with\nGaussian splatting for rendering. Building on the success of mesh-based 3D\nmorphable face models (3DMM), we define GEM as an ensemble of linear eigenbases\nfor representing the head appearance of a specific subject. In particular, we\nconstruct linear bases to represent the position, scale, rotation, and opacity\nof the 3D Gaussians. This allows us to efficiently generate Gaussian primitives\nof a specific head shape by a linear combination of the basis vectors, only\nrequiring a low-dimensional parameter vector that contains the respective\ncoefficients. We propose to construct these linear bases (GEM) by distilling\nhigh-quality compute-intense CNN-based Gaussian avatar models that can generate\nexpression-dependent appearance changes like wrinkles. These high-quality\nmodels are trained on multi-view videos of a subject and are distilled using a\nseries of principal component analyses. 
Once we have obtained the bases that\nrepresent the animatable appearance space of a specific human, we learn a\nregressor that takes a single RGB image as input and predicts the\nlow-dimensional parameter vector that corresponds to the shown facial\nexpression. In a series of experiments, we compare GEM's self-reenactment and\ncross-person reenactment results to state-of-the-art 3D avatar methods,\ndemonstrating GEM's higher visual quality and better generalization to new\nexpressions.\n","authors":["Wojciech Zielonka","Timo Bolkart","Thabo Beeler","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2407.04545v2.pdf","comment":"https://zielon.github.io/gem/"},{"id":"http://arxiv.org/abs/2410.24031v2","updated":"2025-01-14T18:03:42Z","published":"2024-10-31T15:29:51Z","title":"A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems\n using Disparity Maps","summary":" Face recognition technologies are increasingly used in various applications,\nyet they are vulnerable to face spoofing attacks. These spoofing attacks often\ninvolve unique 3D structures, such as printed papers or mobile device screens.\nAlthough stereo-depth cameras can detect such attacks effectively, their\nhigh-cost limits their widespread adoption. Conversely, two-sensor systems\nwithout extrinsic calibration offer a cost-effective alternative but are unable\nto calculate depth using stereo techniques. In this work, we propose a method\nto overcome this challenge by leveraging facial attributes to derive disparity\ninformation and estimate relative depth for anti-spoofing purposes, using\nnon-calibrated systems. We introduce a multi-modal anti-spoofing model, coined\nDisparity Model, that incorporates created disparity maps as a third modality\nalongside the two original sensor modalities. We demonstrate the effectiveness\nof the Disparity Model in countering various spoof attacks using a\ncomprehensive dataset collected from the Intel RealSense ID Solution F455. Our\nmethod outperformed existing methods in the literature, achieving an Equal\nError Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False\nPositive Rate (FPR) of 1%. These errors are lower by 2.45% and 7.94% than the\nerrors of the best comparison method, respectively. Additionally, we introduce\na model ensemble that addresses 3D spoof attacks as well, achieving an EER of\n2.04% and an FNR of 3.83% at an FPR of 1%. Overall, our work provides a\nstate-of-the-art solution for the challenging task of anti-spoofing in\nnon-calibrated systems that lack depth information.\n","authors":["Ariel Larey","Eyal Rond","Omer Achrack"],"pdf_url":"https://arxiv.org/pdf/2410.24031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08286v1","updated":"2025-01-14T18:01:15Z","published":"2025-01-14T18:01:15Z","title":"VINGS-Mono: Visual-Inertial Gaussian Splatting Monocular SLAM in Large\n Scenes","summary":" VINGS-Mono is a monocular (inertial) Gaussian Splatting (GS) SLAM framework\ndesigned for large scenes. The framework comprises four main components: VIO\nFront End, 2D Gaussian Map, NVS Loop Closure, and Dynamic Eraser. In the VIO\nFront End, RGB frames are processed through dense bundle adjustment and\nuncertainty estimation to extract scene geometry and poses. Based on this\noutput, the mapping module incrementally constructs and maintains a 2D Gaussian\nmap. Key components of the 2D Gaussian Map include a Sample-based Rasterizer,\nScore Manager, and Pose Refinement, which collectively improve mapping speed\nand localization accuracy. 
This enables the SLAM system to handle large-scale\nurban environments with up to 50 million Gaussian ellipsoids. To ensure global\nconsistency in large-scale scenes, we design a Loop Closure module, which\ninnovatively leverages the Novel View Synthesis (NVS) capabilities of Gaussian\nSplatting for loop closure detection and correction of the Gaussian map.\nAdditionally, we propose a Dynamic Eraser to address the inevitable presence of\ndynamic objects in real-world outdoor scenes. Extensive evaluations in indoor\nand outdoor environments demonstrate that our approach achieves localization\nperformance on par with Visual-Inertial Odometry while surpassing recent\nGS/NeRF SLAM methods. It also significantly outperforms all existing methods in\nterms of mapping and rendering quality. Furthermore, we developed a mobile app\nand verified that our framework can generate high-quality Gaussian maps in real\ntime using only a smartphone camera and a low-frequency IMU sensor. To the best\nof our knowledge, VINGS-Mono is the first monocular Gaussian SLAM method\ncapable of operating in outdoor environments and supporting kilometer-scale\nlarge scenes.\n","authors":["Ke Wu","Zicheng Zhang","Muer Tie","Ziqing Ai","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2501.08286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08285v1","updated":"2025-01-14T18:00:41Z","published":"2025-01-14T18:00:41Z","title":"Can Bayesian Neural Networks Explicitly Model Input Uncertainty?","summary":" Inputs to machine learning models can have associated noise or uncertainties,\nbut they are often ignored and not modelled. It is unknown if Bayesian Neural\nNetworks and their approximations are able to consider uncertainty in their\ninputs. In this paper we build a two input Bayesian Neural Network (mean and\nstandard deviation) and evaluate its capabilities for input uncertainty\nestimation across different methods like Ensembles, MC-Dropout, and Flipout.\nOur results indicate that only some uncertainty estimation methods for\napproximate Bayesian NNs can model input uncertainty, in particular Ensembles\nand Flipout.\n","authors":["Matias Valdenegro-Toro","Marco Zullich"],"pdf_url":"https://arxiv.org/pdf/2501.08285v1.pdf","comment":"12 pages, 11 figures, VISAPP 2025 camera ready"},{"id":"http://arxiv.org/abs/2501.08282v1","updated":"2025-01-14T17:58:12Z","published":"2025-01-14T17:58:12Z","title":"LLaVA-ST: A Multimodal Large Language Model for Fine-Grained\n Spatial-Temporal Understanding","summary":" Recent advancements in multimodal large language models (MLLMs) have shown\npromising results, yet existing approaches struggle to effectively handle both\ntemporal and spatial localization simultaneously. This challenge stems from two\nkey issues: first, incorporating spatial-temporal localization introduces a\nvast number of coordinate combinations, complicating the alignment of\nlinguistic and visual coordinate representations; second, encoding fine-grained\ntemporal and spatial information during video feature compression is inherently\ndifficult. To address these issues, we propose LLaVA-ST, a MLLM for\nfine-grained spatial-temporal multimodal understanding. In LLaVA-ST, we propose\nLanguage-Aligned Positional Embedding, which embeds the textual coordinate\nspecial token into the visual space, simplifying the alignment of fine-grained\nspatial-temporal correspondences. 
Additionally, we design the Spatial-Temporal\nPacker, which decouples the feature compression of temporal and spatial\nresolutions into two distinct point-to-region attention processing streams.\nFurthermore, we propose the ST-Align dataset with 4.3M training samples for\nfine-grained spatial-temporal multimodal understanding. With ST-Align, we\npresent a progressive training pipeline that aligns the visual and textual\nfeatures through sequential coarse-to-fine stages. Additionally, we introduce an\nST-Align benchmark to evaluate spatial-temporal interleaved fine-grained\nunderstanding tasks, which include Spatial-Temporal Video Grounding (STVG),\nEvent Localization and Captioning (ELC), and Spatial Video Grounding (SVG).\nLLaVA-ST achieves outstanding performance on 11 benchmarks requiring\nfine-grained temporal, spatial, or spatial-temporal interleaving multimodal\nunderstanding. Our code, data, and benchmark will be released at\nhttps://github.com/appletea233/LLaVA-ST.\n","authors":["Hongyu Li","Jinyu Chen","Ziyu Wei","Shaofei Huang","Tianrui Hui","Jialin Gao","Xiaoming Wei","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08279v1","updated":"2025-01-14T17:55:12Z","published":"2025-01-14T17:55:12Z","title":"SmartEraser: Remove Anything from Images using Masked-Region Guidance","summary":" Object removal has so far been dominated by the mask-and-inpaint paradigm,\nwhere the masked region is excluded from the input, leaving models relying on\nunmasked areas to inpaint the missing region. However, this approach lacks\ncontextual information for the masked area, often resulting in unstable\nperformance. In this work, we introduce SmartEraser, built with a new removing\nparadigm called Masked-Region Guidance. This paradigm retains the masked region\nin the input, using it as guidance for the removal process. It offers several\ndistinct advantages: (a) it guides the model to accurately identify the object\nto be removed, preventing its regeneration in the output; (b) since the user\nmask often extends beyond the object itself, it aids in preserving the\nsurrounding context in the final result. Leveraging this new paradigm, we\npresent Syn4Removal, a large-scale object removal dataset, where instance\nsegmentation data is used to copy and paste objects onto images as removal\ntargets, with the original images serving as ground truths. Experimental\nresults demonstrate that SmartEraser significantly outperforms existing\nmethods, achieving superior performance in object removal, especially in\ncomplex scenes with intricate compositions.\n","authors":["Longtao Jiang","Zhendong Wang","Jianmin Bao","Wengang Zhou","Dongdong Chen","Lei Shi","Dong Chen","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2501.08279v1.pdf","comment":"Project at: https://longtaojiang.github.io/smarteraser.github.io/"},{"id":"http://arxiv.org/abs/2406.08476v2","updated":"2025-01-14T17:46:01Z","published":"2024-06-12T17:59:04Z","title":"RMem: Restricted Memory Banks Improve Video Object Segmentation","summary":" With recent video object segmentation (VOS) benchmarks evolving to\nchallenging scenarios, we revisit a simple but overlooked strategy: restricting\nthe size of memory banks. This diverges from the prevalent practice of\nexpanding memory banks to accommodate extensive historical information. 
Our\nspecially designed \"memory deciphering\" study offers a pivotal insight\nunderpinning such a strategy: expanding memory banks, while seemingly\nbeneficial, actually increases the difficulty for VOS modules to decode\nrelevant features due to the confusion from redundant information. By\nrestricting memory banks to a limited number of essential frames, we achieve a\nnotable improvement in VOS accuracy. This process balances the importance and\nfreshness of frames to maintain an informative memory bank within a bounded\ncapacity. Additionally, restricted memory banks reduce the training-inference\ndiscrepancy in memory lengths compared with continuous expansion. This fosters\nnew opportunities in temporal reasoning and enables us to introduce the\npreviously overlooked \"temporal positional embedding.\" Finally, our insights\nare embodied in \"RMem\" (\"R\" for restricted), a simple yet effective VOS\nmodification that excels at challenging VOS scenarios and establishes new state\nof the art for object state changes (on the VOST dataset) and long videos (on\nthe Long Videos dataset). Our code and demo are available at\nhttps://restricted-memory.github.io/.\n","authors":["Junbao Zhou","Ziqi Pang","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2406.08476v2.pdf","comment":"CVPR 2024, Project Page: https://restricted-memory.github.io/"},{"id":"http://arxiv.org/abs/2409.07571v3","updated":"2025-01-14T17:33:46Z","published":"2024-09-11T18:58:16Z","title":"FaVoR: Features via Voxel Rendering for Camera Relocalization","summary":" Camera relocalization methods range from dense image alignment to direct\ncamera pose regression from a query image. Among these, sparse feature matching\nstands out as an efficient, versatile, and generally lightweight approach with\nnumerous applications. However, feature-based methods often struggle with\nsignificant viewpoint and appearance changes, leading to matching failures and\ninaccurate pose estimates. To overcome this limitation, we propose a novel\napproach that leverages a globally sparse yet locally dense 3D representation\nof 2D features. By tracking and triangulating landmarks over a sequence of\nframes, we construct a sparse voxel map optimized to render image patch\ndescriptors observed during tracking. Given an initial pose estimate, we first\nsynthesize descriptors from the voxels using volumetric rendering and then\nperform feature matching to estimate the camera pose. This methodology enables\nthe generation of descriptors for unseen views, enhancing robustness to view\nchanges. We extensively evaluate our method on the 7-Scenes and Cambridge\nLandmarks datasets. 
Our results show that our method significantly outperforms\nexisting state-of-the-art feature representation techniques in indoor\nenvironments, achieving up to a 39% improvement in median translation error.\nAdditionally, our approach yields comparable results to other methods for\noutdoor scenarios while maintaining lower memory and computational costs.\n","authors":["Vincenzo Polizzi","Marco Cannici","Davide Scaramuzza","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2409.07571v3.pdf","comment":"Accepted to the IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV), Tucson, Arizona, US, Feb 28-Mar 4, 2025"},{"id":"http://arxiv.org/abs/2501.06693v2","updated":"2025-01-14T17:29:06Z","published":"2025-01-12T03:01:15Z","title":"Vid2Sim: Realistic and Interactive Simulation from Video for Urban\n Navigation","summary":" Sim-to-real gap has long posed a significant challenge for robot learning in\nsimulation, preventing the deployment of learned models in the real world.\nPrevious work has primarily focused on domain randomization and system\nidentification to mitigate this gap. However, these methods are often limited\nby the inherent constraints of the simulation and graphics engines. In this\nwork, we propose Vid2Sim, a novel framework that effectively bridges the\nsim2real gap through a scalable and cost-efficient real2sim pipeline for neural\n3D scene reconstruction and simulation. Given a monocular video as input,\nVid2Sim can generate photorealistic and physically interactable 3D simulation\nenvironments to enable the reinforcement learning of visual navigation agents\nin complex urban environments. Extensive experiments demonstrate that Vid2Sim\nsignificantly improves the performance of urban navigation in the digital twins\nand real world by 31.2% and 68.3% in success rate compared with agents trained\nwith prior simulation methods.\n","authors":["Ziyang Xie","Zhizheng Liu","Zhenghao Peng","Wayne Wu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.06693v2.pdf","comment":"Project page: https://metadriverse.github.io/vid2sim/"},{"id":"http://arxiv.org/abs/2501.08266v1","updated":"2025-01-14T17:26:02Z","published":"2025-01-14T17:26:02Z","title":"AI Driven Water Segmentation with deep learning models for Enhanced\n Flood Monitoring","summary":" Flooding is a major natural hazard causing significant fatalities and\neconomic losses annually, with increasing frequency due to climate change.\nRapid and accurate flood detection and monitoring are crucial for mitigating\nthese impacts. This study compares the performance of three deep learning\nmodels UNet, ResNet, and DeepLabv3 for pixelwise water segmentation to aid in\nflood detection, utilizing images from drones, in field observations, and\nsocial media. This study involves creating a new dataset that augments\nwellknown benchmark datasets with flood-specific images, enhancing the\nrobustness of the models. The UNet, ResNet, and DeepLab v3 architectures are\ntested to determine their effectiveness in various environmental conditions and\ngeographical locations, and the strengths and limitations of each model are\nalso discussed here, providing insights into their applicability in different\nscenarios by predicting image segmentation masks. This fully automated approach\nallows these models to isolate flooded areas in images, significantly reducing\nprocessing time compared to traditional semi-automated methods. 
The outcomes of\nthis study are the predicted segmentation masks for each flood-affected image\nand the validation accuracy of these models. This methodology\nfacilitates timely and continuous flood monitoring, providing vital data for\nemergency response teams to reduce loss of life and economic damages. It offers\na significant reduction in the time required to generate flood maps, cutting\ndown the manual processing time. Additionally, we present avenues for future\nresearch, including the integration of multimodal data sources and the\ndevelopment of robust deep learning architectures tailored specifically for\nflood detection tasks. Overall, our work contributes to the advancement of\nflood management strategies through the innovative use of deep learning\ntechnologies.\n","authors":["Sanjida Afrin Mou","Tasfia Noor Chowdhury","Adib Ibn Mannan","Sadia Nourin Mim","Lubana Tarannum","Tasrin Noman","Jamal Uddin Ahamed"],"pdf_url":"https://arxiv.org/pdf/2501.08266v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.08258v1","updated":"2025-01-14T17:10:02Z","published":"2025-01-14T17:10:02Z","title":"Towards an End-to-End (E2E) Adversarial Learning and Application in the\n Physical World","summary":" The traditional learning process of patch-based adversarial attacks,\nconducted in the digital domain and then applied in the physical domain (e.g.,\nvia printed stickers), may suffer from reduced performance due to adversarial\npatches' limited transferability from the digital domain to the physical\ndomain. Given that previous studies have considered using projectors to apply\nadversarial attacks, we raise the following question: can adversarial learning\n(i.e., patch generation) be performed entirely in the physical domain with a\nprojector? In this work, we propose the Physical-domain Adversarial Patch\nLearning Augmentation (PAPLA) framework, a novel end-to-end (E2E) framework\nthat converts adversarial learning from the digital domain to the physical\ndomain using a projector. We evaluate PAPLA across multiple scenarios,\nincluding controlled laboratory settings and realistic outdoor environments,\ndemonstrating its ability to ensure attack success compared to conventional\ndigital learning-physical application (DL-PA) methods. We also analyze the\nimpact of environmental factors, such as projection surface color, projector\nstrength, ambient light, distance, and angle of the target object relative to\nthe camera, on the effectiveness of projected patches. In addition, we demonstrate\nthe feasibility of the attack against a parked car and a stop sign in a\nreal-world outdoor environment. Our results show that under specific\nconditions, E2E adversarial learning in the physical domain eliminates the\ntransferability issue and ensures evasion by object detectors. Finally, we\nprovide insights into the challenges and opportunities of applying adversarial\nlearning in the physical domain and explain where such an approach is more\neffective than using a sticker.\n","authors":["Dudi Biton","Jacob Shams","Koda Satoru","Asaf Shabtai","Yuval Elovici","Ben Nassi"],"pdf_url":"https://arxiv.org/pdf/2501.08258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10733v4","updated":"2025-01-14T16:47:44Z","published":"2024-10-14T17:15:07Z","title":"Deep Compression Autoencoder for Efficient High-Resolution Diffusion\n Models","summary":" We present Deep Compression Autoencoder (DC-AE), a new family of autoencoder\nmodels for accelerating high-resolution diffusion models. 
Existing autoencoder\nmodels have demonstrated impressive results at a moderate spatial compression\nratio (e.g., 8x), but fail to maintain satisfactory reconstruction accuracy for\nhigh spatial compression ratios (e.g., 64x). We address this challenge by\nintroducing two key techniques: (1) Residual Autoencoding, where we design our\nmodels to learn residuals based on the space-to-channel transformed features to\nalleviate the optimization difficulty of high spatial-compression autoencoders;\n(2) Decoupled High-Resolution Adaptation, an efficient decoupled three-phases\ntraining strategy for mitigating the generalization penalty of high\nspatial-compression autoencoders. With these designs, we improve the\nautoencoder's spatial compression ratio up to 128 while maintaining the\nreconstruction quality. Applying our DC-AE to latent diffusion models, we\nachieve significant speedup without accuracy drop. For example, on ImageNet\n512x512, our DC-AE provides 19.1x inference speedup and 17.9x training speedup\non H100 GPU for UViT-H while achieving a better FID, compared with the widely\nused SD-VAE-f8 autoencoder. Our code is available at\nhttps://github.com/mit-han-lab/efficientvit.\n","authors":["Junyu Chen","Han Cai","Junsong Chen","Enze Xie","Shang Yang","Haotian Tang","Muyang Li","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2410.10733v4.pdf","comment":"Preprint. First two authors contributed equally to this work. Update:\n add USiT (UViT+SiT sampler) results"},{"id":"http://arxiv.org/abs/2405.20299v4","updated":"2025-01-14T16:38:36Z","published":"2024-05-30T17:46:23Z","title":"Scaling White-Box Transformers for Vision","summary":" CRATE, a white-box transformer architecture designed to learn compressed and\nsparse representations, offers an intriguing alternative to standard vision\ntransformers (ViTs) due to its inherent mathematical interpretability. Despite\nextensive investigations into the scaling behaviors of language and vision\ntransformers, the scalability of CRATE remains an open question which this\npaper aims to address. Specifically, we propose CRATE-$\\alpha$, featuring\nstrategic yet minimal modifications to the sparse coding block in the CRATE\narchitecture design, and a light training recipe designed to improve the\nscalability of CRATE. Through extensive experiments, we demonstrate that\nCRATE-$\\alpha$ can effectively scale with larger model sizes and datasets. For\nexample, our CRATE-$\\alpha$-B substantially outperforms the prior best CRATE-B\nmodel accuracy on ImageNet classification by 3.7%, achieving an accuracy of\n83.2%. Meanwhile, when scaling further, our CRATE-$\\alpha$-L obtains an\nImageNet classification accuracy of 85.1%. More notably, these model\nperformance improvements are achieved while preserving, and potentially even\nenhancing the interpretability of learned CRATE models, as we demonstrate\nthrough showing that the learned token representations of increasingly larger\ntrained CRATE-$\\alpha$ models yield increasingly higher-quality unsupervised\nobject segmentation of images. 
The project page is\nhttps://rayjryang.github.io/CRATE-alpha/.\n","authors":["Jinrui Yang","Xianhang Li","Druv Pai","Yuyin Zhou","Yi Ma","Yaodong Yu","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2405.20299v4.pdf","comment":"project page: https://rayjryang.github.io/CRATE-alpha/"},{"id":"http://arxiv.org/abs/2501.08245v1","updated":"2025-01-14T16:31:01Z","published":"2025-01-14T16:31:01Z","title":"Continual Deep Active Learning for Medical Imaging: Replay-Base\n Architecture for Context Adaptation","summary":" Deep Learning for medical imaging faces challenges in adapting and\ngeneralizing to new contexts. Additionally, it often lacks sufficient labeled\ndata for specific tasks requiring significant annotation effort. Continual\nLearning (CL) tackles adaptability and generalizability by enabling lifelong\nlearning from a data stream while mitigating forgetting of previously learned\nknowledge. Active Learning (AL) reduces the number of required annotations for\neffective training. This work explores both approaches (CAL) to develop a novel\nframework for robust medical image analysis. Based on the automatic recognition\nof shifts in image characteristics, Replay-Base Architecture for Context\nAdaptation (RBACA) employs a CL rehearsal method to continually learn from\ndiverse contexts, and an AL component to select the most informative instances\nfor annotation. A novel approach to evaluate CAL methods is established using a\ndefined metric denominated IL-Score, which allows for the simultaneous\nassessment of transfer learning, forgetting, and final model performance. We\nshow that RBACA works in domain and class-incremental learning scenarios, by\nassessing its IL-Score on the segmentation and diagnosis of cardiac images. The\nresults show that RBACA outperforms a baseline framework without CAL, and a\nstate-of-the-art CAL method across various memory sizes and annotation budgets.\nOur code is available in https://github.com/RuiDaniel/RBACA .\n","authors":["Rui Daniel","M. Rita Verdelho","Catarina Barata","Carlos Santiago"],"pdf_url":"https://arxiv.org/pdf/2501.08245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08241v1","updated":"2025-01-14T16:28:02Z","published":"2025-01-14T16:28:02Z","title":"A Feature-Level Ensemble Model for COVID-19 Identification in CXR Images\n using Choquet Integral and Differential Evolution Optimization","summary":" The COVID-19 pandemic has profoundly impacted billions globally. It\nchallenges public health and healthcare systems due to its rapid spread and\nsevere respiratory effects. An effective strategy to mitigate the COVID-19\npandemic involves integrating testing to identify infected individuals. While\nRT-PCR is considered the gold standard for diagnosing COVID-19, it has some\nlimitations such as the risk of false negatives. To address this problem, this\npaper introduces a novel Deep Learning Diagnosis System that integrates\npre-trained Deep Convolutional Neural Networks (DCNNs) within an ensemble\nlearning framework to achieve precise identification of COVID-19 cases from\nChest X-ray (CXR) images. We combine feature vectors from the final hidden\nlayers of pre-trained DCNNs using the Choquet integral to capture interactions\nbetween different DCNNs that a linear approach cannot. We employed\nSugeno-$\\lambda$ measure theory to derive fuzzy measures for subsets of\nnetworks to enable aggregation. We utilized Differential Evolution to estimate\nfuzzy densities. 
We developed a TensorFlow-based layer for Choquet operation to\nfacilitate efficient aggregation, due to the intricacies involved in\naggregating feature vectors. Experimental results on the COVIDx dataset show\nthat our ensemble model achieved 98\\% accuracy in three-class classification\nand 99.50\\% in binary classification, outperforming its components-DenseNet-201\n(97\\% for three-class, 98.75\\% for binary), Inception-v3 (96.25\\% for\nthree-class, 98.50\\% for binary), and Xception (94.50\\% for three-class, 98\\%\nfor binary)-and surpassing many previous methods.\n","authors":["Amir Reza Takhsha","Maryam Rastgarpour","Mozhgan Naderi"],"pdf_url":"https://arxiv.org/pdf/2501.08241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10729v2","updated":"2025-01-14T16:17:00Z","published":"2024-06-15T20:04:06Z","title":"A Comprehensive Survey of Foundation Models in Medicine","summary":" Foundation models (FMs) are large-scale deep learning models that are\ndeveloped using large datasets and self-supervised learning methods. These\nmodels serve as a base for different downstream tasks, including healthcare.\nFMs have been adopted with great success across various domains within\nhealthcare. Existing healthcare-based surveys have not yet included all of\nthese domains. Therefore, we provide a detailed survey of FMs in healthcare. We\nfocus on the history, learning strategies, flagship models, applications, and\nchallenges of FMs. We explore how FMs such as the BERT and GPT families are\nreshaping various healthcare domains, including clinical large language models,\nmedical image analysis, and omics. Furthermore, we provide a detailed taxonomy\nof healthcare applications facilitated by FMs, such as clinical NLP, medical\ncomputer vision, graph learning, and other biology-related tasks. Despite the\npromising opportunities FMs provide, they also have several associated\nchallenges, which are explained in detail. We also outline open research issues\nand potential lessons learned to provide researchers and practitioners with\ninsights into the capabilities of FMs in healthcare to advance their deployment\nand mitigate associated risks.\n","authors":["Wasif Khan","Seowung Leem","Kyle B. See","Joshua K. Wong","Shaoting Zhang","Ruogu Fang"],"pdf_url":"https://arxiv.org/pdf/2406.10729v2.pdf","comment":"Currently under review in IEEE REVIEWS IN BIOMEDICAL ENGINEERING"},{"id":"http://arxiv.org/abs/2307.09059v3","updated":"2025-01-14T16:11:11Z","published":"2023-07-18T08:23:46Z","title":"Text-guided Image Restoration and Semantic Enhancement for Text-to-Image\n Person Retrieval","summary":" The goal of Text-to-Image Person Retrieval (TIPR) is to retrieve specific\nperson images according to the given textual descriptions. A primary challenge\nin this task is bridging the substantial representational gap between visual\nand textual modalities. The prevailing methods map texts and images into\nunified embedding space for matching, while the intricate semantic\ncorrespondences between texts and images are still not effectively constructed.\nTo address this issue, we propose a novel TIPR framework to build fine-grained\ninteractions and alignment between person images and the corresponding texts.\nSpecifically, via fine-tuning the Contrastive Language-Image Pre-training\n(CLIP) model, a visual-textual dual encoder is firstly constructed, to\npreliminarily align the image and text features. 
Secondly, a Text-guided Image\nRestoration (TIR) auxiliary task is proposed to map abstract textual entities\nto specific image regions, improving the alignment between local textual and\nvisual embeddings. Additionally, a cross-modal triplet loss is presented to\nhandle hard samples, and further enhance the model's discriminability for minor\ndifferences. Moreover, a pruning-based text data augmentation approach is\nproposed to enhance focus on essential elements in descriptions, thereby\navoiding excessive model attention to less significant information. The\nexperimental results show our proposed method outperforms state-of-the-art\nmethods on three popular benchmark datasets, and the code will be made publicly\navailable at https://github.com/Delong-liu-bupt/SEN.\n","authors":["Delong Liu","Haiwen Li","Zhicheng Zhao","Yuan Dong","Nikolaos V. Boulgouris"],"pdf_url":"https://arxiv.org/pdf/2307.09059v3.pdf","comment":"The paper was withdrawn due to a dispute among the authors regarding\n the content of the article"},{"id":"http://arxiv.org/abs/2501.08226v1","updated":"2025-01-14T16:10:25Z","published":"2025-01-14T16:10:25Z","title":"Efficient Deep Learning-based Forward Solvers for Brain Tumor Growth\n Models","summary":" Glioblastoma, a highly aggressive brain tumor, poses major challenges due to\nits poor prognosis and high morbidity rates. Partial differential\nequation-based models offer promising potential to enhance therapeutic outcomes\nby simulating patient-specific tumor behavior for improved radiotherapy\nplanning. However, model calibration remains a bottleneck due to the high\ncomputational demands of optimization methods like Monte Carlo sampling and\nevolutionary algorithms. To address this, we recently introduced an approach\nleveraging a neural forward solver with gradient-based optimization to\nsignificantly reduce calibration time. This approach requires a highly accurate\nand fully differentiable forward model. We investigate multiple architectures,\nincluding (i) an enhanced TumorSurrogate, (ii) a modified nnU-Net, and (iii) a\n3D Vision Transformer (ViT). The optimized TumorSurrogate achieved the best\noverall results, excelling in both tumor outline matching and voxel-level\nprediction of tumor cell concentration. It halved the MSE relative to the\nbaseline model and achieved the highest Dice score across all tumor cell\nconcentration thresholds. Our study demonstrates significant enhancement in\nforward solver performance and outlines important future research directions.\n","authors":["Zeineb Haouari","Jonas Weidner","Ivan Ezhov","Aswathi Varma","Daniel Rueckert","Bjoern Menze","Benedikt Wiestler"],"pdf_url":"https://arxiv.org/pdf/2501.08226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08225v1","updated":"2025-01-14T16:09:16Z","published":"2025-01-14T16:09:16Z","title":"FramePainter: Endowing Interactive Image Editing with Video Diffusion\n Priors","summary":" Interactive image editing allows users to modify images through visual\ninteraction operations such as drawing, clicking, and dragging. Existing\nmethods construct such supervision signals from videos, as they capture how\nobjects change with various physical interactions. However, these models are\nusually built upon text-to-image diffusion models, so necessitate (i) massive\ntraining samples and (ii) an additional reference encoder to learn real-world\ndynamics and visual consistency. 
In this paper, we reformulate this task as an\nimage-to-video generation problem, thereby inheriting powerful video diffusion\npriors to reduce training costs and ensure temporal consistency. Specifically,\nwe introduce FramePainter as an efficient instantiation of this formulation.\nInitialized with Stable Video Diffusion, it only uses a lightweight sparse\ncontrol encoder to inject editing signals. Considering the limitations of\ntemporal attention in handling large motion between two frames, we further\npropose matching attention to enlarge the receptive field while encouraging\ndense correspondence between edited and source image tokens. We highlight the\neffectiveness and efficiency of FramePainter across various editing signals:\nit dominantly outperforms previous state-of-the-art methods with far less\ntraining data, achieving highly seamless and coherent editing of images, \\eg,\nautomatically adjusting the reflection of the cup. Moreover, FramePainter also\nexhibits exceptional generalization in scenarios not present in real-world\nvideos, \\eg, transforming the clownfish into a shark-like shape. Our code will be\navailable at https://github.com/YBYBZhang/FramePainter.\n","authors":["Yabo Zhang","Xinpeng Zhou","Yihan Zeng","Hang Xu","Hui Li","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2501.08225v1.pdf","comment":"Code: https://github.com/YBYBZhang/FramePainter"},{"id":"http://arxiv.org/abs/2408.12454v3","updated":"2025-01-14T15:35:55Z","published":"2024-08-22T14:52:53Z","title":"Relaxed Rotational Equivariance via $G$-Biases in Vision","summary":" Group Equivariant Convolution (GConv) can capture rotational equivariance\nfrom original data. It assumes uniform and strict rotational equivariance\nacross all features as the transformations under the specific group. However,\nthe presentation or distribution of real-world data rarely conforms to strict\nrotational equivariance, commonly referred to as Rotational Symmetry-Breaking\n(RSB) in the system or dataset, making GConv unable to adapt effectively to\nthis phenomenon. Motivated by this, we propose a simple but highly effective\nmethod to address this problem, which utilizes a set of learnable biases called\n$G$-Biases under the group order to break strict group constraints and then\nachieve a Relaxed Rotational Equivariant Convolution (RREConv). To validate the\nefficiency of RREConv, we conduct extensive ablation experiments on the\ndiscrete rotational group $\\mathcal{C}_n$. Experiments demonstrate that the\nproposed RREConv-based methods achieve excellent performance compared to\nexisting GConv-based methods in both classification and 2D object detection\ntasks on natural image datasets.\n","authors":["Zhiqiang Wu","Yingjie Liu","Licheng Sun","Jian Yang","Hanlin Dong","Shing-Ho J. Lin","Xuan Tang","Jinpeng Mi","Bo Jin","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2408.12454v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08199v1","updated":"2025-01-14T15:23:36Z","published":"2025-01-14T15:23:36Z","title":"EmoNeXt: an Adapted ConvNeXt for Facial Emotion Recognition","summary":" Facial expressions play a crucial role in human communication, serving as a\npowerful and impactful means to express a wide range of emotions. With\nadvancements in artificial intelligence and computer vision, deep neural\nnetworks have emerged as effective tools for facial emotion recognition. In\nthis paper, we propose EmoNeXt, a novel deep learning framework for facial\nexpression recognition based on an adapted ConvNeXt architecture network. 
We\nintegrate a Spatial Transformer Network (STN) to focus on feature-rich regions\nof the face and Squeeze-and-Excitation blocks to capture channel-wise\ndependencies. Moreover, we introduce a self-attention regularization term,\nencouraging the model to generate compact feature vectors. We demonstrate the\nsuperiority of our model over existing state-of-the-art deep learning models on\nthe FER2013 dataset regarding emotion classification accuracy.\n","authors":["Yassine El Boudouri","Amine Bohi"],"pdf_url":"https://arxiv.org/pdf/2501.08199v1.pdf","comment":"6 pages, 5 figures and 2 tables. 2023 IEEE 25th International\n Workshop on Multimedia Signal Processing (MMSP), Poitiers, France"},{"id":"http://arxiv.org/abs/2501.08195v1","updated":"2025-01-14T15:18:28Z","published":"2025-01-14T15:18:28Z","title":"Self-supervised Deep Hyperspectral Inpainting with the Plug and Play and\n Deep Image Prior Models","summary":" Hyperspectral images are typically composed of hundreds of narrow and\ncontiguous spectral bands, each containing information regarding the material\ncomposition of the imaged scene. However, these images can be affected by\nvarious sources of noise, distortions, or data loss, which can significantly\ndegrade their quality and usefulness. This paper introduces a convergent\nguaranteed algorithm, LRS-PnP-DIP(1-Lip), which successfully addresses the\ninstability issue of DHP that has been reported before. The proposed algorithm\nextends the successful joint low-rank and sparse model to further exploit the\nunderlying data structures beyond the conventional and sometimes restrictive\nunions of subspace models. A stability analysis guarantees the convergence of\nthe proposed algorithm under mild assumptions , which is crucial for its\napplication in real-world scenarios. Extensive experiments demonstrate that the\nproposed solution consistently delivers visually and quantitatively superior\ninpainting results, establishing state-of-the-art performance.\n","authors":["Shuo Li","Mehrdad Yaghoobi"],"pdf_url":"https://arxiv.org/pdf/2501.08195v1.pdf","comment":"31 pages, 9 Figures, 7 Tables. arXiv admin note: text overlap with\n arXiv:2306.08128"},{"id":"http://arxiv.org/abs/2501.08188v1","updated":"2025-01-14T15:13:00Z","published":"2025-01-14T15:13:00Z","title":"A Critical Synthesis of Uncertainty Quantification and Foundation Models\n in Monocular Depth Estimation","summary":" While recent foundation models have enabled significant breakthroughs in\nmonocular depth estimation, a clear path towards safe and reliable deployment\nin the real-world remains elusive. Metric depth estimation, which involves\npredicting absolute distances, poses particular challenges, as even the most\nadvanced foundation models remain prone to critical errors. Since quantifying\nthe uncertainty has emerged as a promising endeavor to address these\nlimitations and enable trustworthy deployment, we fuse five different\nuncertainty quantification methods with the current state-of-the-art\nDepthAnythingV2 foundation model. To cover a wide range of metric depth\ndomains, we evaluate their performance on four diverse datasets. Our findings\nidentify fine-tuning with the Gaussian Negative Log-Likelihood Loss (GNLL) as a\nparticularly promising approach, offering reliable uncertainty estimates while\nmaintaining predictive performance and computational efficiency on par with the\nbaseline, encompassing both training and inference time. 
By fusing uncertainty\nquantification and foundation models within the context of monocular depth\nestimation, this paper lays a critical foundation for future research aimed at\nimproving not only model performance but also its explainability. Extending\nthis critical synthesis of uncertainty quantification and foundation models\ninto other crucial tasks, such as semantic segmentation and pose estimation,\npresents exciting opportunities for safer and more reliable machine vision\nsystems.\n","authors":["Steven Landgraf","Rongjun Qin","Markus Ulrich"],"pdf_url":"https://arxiv.org/pdf/2501.08188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08182v1","updated":"2025-01-14T15:08:56Z","published":"2025-01-14T15:08:56Z","title":"CG-MER: A Card Game-based Multimodal dataset for Emotion Recognition","summary":" The field of affective computing has seen significant advancements in\nexploring the relationship between emotions and emerging technologies. This\npaper presents a novel and valuable contribution to this field with the\nintroduction of a comprehensive French multimodal dataset designed specifically\nfor emotion recognition. The dataset encompasses three primary modalities:\nfacial expressions, speech, and gestures, providing a holistic perspective on\nemotions. Moreover, the dataset has the potential to incorporate additional\nmodalities, such as Natural Language Processing (NLP) to expand the scope of\nemotion recognition research. The dataset was curated through engaging\nparticipants in card game sessions, where they were prompted to express a range\nof emotions while responding to diverse questions. The study included 10\nsessions with 20 participants (9 females and 11 males). The dataset serves as a\nvaluable resource for furthering research in emotion recognition and provides\nan avenue for exploring the intricate connections between human emotions and\ndigital technologies.\n","authors":["Nessrine Farhat","Amine Bohi","Leila Ben Letaifa","Rim Slama"],"pdf_url":"https://arxiv.org/pdf/2501.08182v1.pdf","comment":"8 pages, 2 figures and 4 tables. Sixteenth International Conference\n on Machine Vision (ICMV 2023), Yerevan, Armenia"},{"id":"http://arxiv.org/abs/2501.08180v1","updated":"2025-01-14T15:03:53Z","published":"2025-01-14T15:03:53Z","title":"D$^2$-DPM: Dual Denoising for Quantized Diffusion Probabilistic Models","summary":" Diffusion models have achieved cutting-edge performance in image generation.\nHowever, their lengthy denoising process and computationally intensive score\nestimation network impede their scalability in low-latency and\nresource-constrained scenarios. Post-training quantization (PTQ) compresses and\naccelerates diffusion models without retraining, but it inevitably introduces\nadditional quantization noise, resulting in mean and variance deviations. In\nthis work, we propose D2-DPM, a dual denoising mechanism aimed at precisely\nmitigating the adverse effects of quantization noise on the noise estimation\nnetwork. Specifically, we first unravel the impact of quantization noise on the\nsampling equation into two components: the mean deviation and the variance\ndeviation. The mean deviation alters the drift coefficient of the sampling\nequation, influencing the trajectory trend, while the variance deviation\nmagnifies the diffusion coefficient, impacting the convergence of the sampling\ntrajectory. 
The proposed D2-DPM is thus devised to denoise the quantization\nnoise at each time step, and then denoise the noisy sample through the inverse\ndiffusion iterations. Experimental results demonstrate that D2-DPM achieves\nsuperior generation quality, yielding a 1.42 lower FID than the full-precision\nmodel while achieving 3.99x compression and 11.67x bit-operation acceleration.\n","authors":["Qian Zeng","Jie Song","Han Zheng","Hao Jiang","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2501.08180v1.pdf","comment":"9 pages, 4 figures, acceptted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.08174v1","updated":"2025-01-14T14:56:31Z","published":"2025-01-14T14:56:31Z","title":"Object-Centric 2D Gaussian Splatting: Background Removal and\n Occlusion-Aware Pruning for Compact Object Models","summary":" Current Gaussian Splatting approaches are effective for reconstructing entire\nscenes but lack the option to target specific objects, making them\ncomputationally expensive and unsuitable for object-specific applications. We\npropose a novel approach that leverages object masks to enable targeted\nreconstruction, resulting in object-centric models. Additionally, we introduce\nan occlusion-aware pruning strategy to minimize the number of Gaussians without\ncompromising quality. Our method reconstructs compact object models, yielding\nobject-centric Gaussian and mesh representations that are up to 96\\% smaller\nand up to 71\\% faster to train compared to the baseline while retaining\ncompetitive quality. These representations are immediately usable for\ndownstream applications such as appearance editing and physics simulation\nwithout additional processing.\n","authors":["Marcel Rogge","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2501.08174v1.pdf","comment":"Accepted at ICPRAM 2025 (https://icpram.scitevents.org/Home.aspx)"},{"id":"http://arxiv.org/abs/2411.19835v2","updated":"2025-01-14T14:53:10Z","published":"2024-11-29T16:45:25Z","title":"Feedback-driven object detection and iterative model improvement","summary":" Automated object detection has become increasingly valuable across diverse\napplications, yet efficient, high-quality annotation remains a persistent\nchallenge. In this paper, we present the development and evaluation of a\nplatform designed to interactively improve object detection models. The\nplatform allows uploading and annotating images as well as fine-tuning object\ndetection models. Users can then manually review and refine annotations,\nfurther creating improved snapshots that are used for automatic object\ndetection on subsequent image uploads - a process we refer to as semi-automatic\nannotation resulting in a significant gain in annotation efficiency.\n Whereas iterative refinement of model results to speed up annotation has\nbecome common practice, we are the first to quantitatively evaluate its\nbenefits with respect to time, effort, and interaction savings. Our\nexperimental results show clear evidence for a significant time reduction of up\nto 53% for semi-automatic compared to manual annotation. Importantly, these\nefficiency gains did not compromise annotation quality, while matching or\noccasionally even exceeding the accuracy of manual annotations. 
These findings\ndemonstrate the potential of our lightweight annotation platform for creating\nhigh-quality object detection datasets and provide best practices to guide\nfuture development of annotation platforms.\n The platform is open-source, with the frontend and backend repositories\navailable on GitHub (https://github.com/ml-lab-htw/iterative-annotate). To\nsupport the understanding of our labeling process, we have created an\nexplanatory video demonstrating the methodology using microscopy images of E.\ncoli bacteria as an example. The video is available on YouTube\n(https://www.youtube.com/watch?v=CM9uhE8NN5E).\n","authors":["Sönke Tenckhoff","Mario Koddenbrock","Erik Rodner"],"pdf_url":"https://arxiv.org/pdf/2411.19835v2.pdf","comment":"AI4EA24"},{"id":"http://arxiv.org/abs/2501.08170v1","updated":"2025-01-14T14:50:57Z","published":"2025-01-14T14:50:57Z","title":"Benchmarking Multimodal Models for Fine-Grained Image Analysis: A\n Comparative Study Across Diverse Visual Features","summary":" This article introduces a benchmark designed to evaluate the capabilities of\nmultimodal models in analyzing and interpreting images. The benchmark focuses\non seven key visual aspects: main object, additional objects, background,\ndetail, dominant colors, style, and viewpoint. A dataset of 14,580 images,\ngenerated from diverse text prompts, was used to assess the performance of\nseven leading multimodal models. These models were evaluated on their ability\nto accurately identify and describe each visual aspect, providing insights into\ntheir strengths and weaknesses for comprehensive image understanding. The\nfindings of this benchmark have significant implications for the development\nand selection of multimodal models for various image analysis tasks.\n","authors":["Evgenii Evstafev"],"pdf_url":"https://arxiv.org/pdf/2501.08170v1.pdf","comment":"6 pages, 2 tables, 2 charts"},{"id":"http://arxiv.org/abs/2501.08169v1","updated":"2025-01-14T14:49:49Z","published":"2025-01-14T14:49:49Z","title":"Revolutionizing Communication with Deep Learning and XAI for Enhanced\n Arabic Sign Language Recognition","summary":" This study introduces an integrated approach to recognizing Arabic Sign\nLanguage (ArSL) using state-of-the-art deep learning models such as\nMobileNetV3, ResNet50, and EfficientNet-B2. These models are further enhanced\nby explainable AI (XAI) techniques to boost interpretability. The ArSL2018 and\nRGB Arabic Alphabets Sign Language (AASL) datasets are employed, with\nEfficientNet-B2 achieving peak accuracies of 99.48\\% and 98.99\\%, respectively.\nKey innovations include sophisticated data augmentation methods to mitigate\nclass imbalance, implementation of stratified 5-fold cross-validation for\nbetter generalization, and the use of Grad-CAM for clear model decision\ntransparency. The proposed system not only sets new benchmarks in recognition\naccuracy but also emphasizes interpretability, making it suitable for\napplications in healthcare, education, and inclusive communication\ntechnologies.\n","authors":["Mazen Balat","Rewaa Awaad","Ahmed B. Zaky","Salah A. 
Aly"],"pdf_url":"https://arxiv.org/pdf/2501.08169v1.pdf","comment":"13 pages, 25 figures, 16 tables"},{"id":"http://arxiv.org/abs/2412.13174v2","updated":"2025-01-14T14:48:32Z","published":"2024-12-17T18:53:43Z","title":"ORFormer: Occlusion-Robust Transformer for Accurate Facial Landmark\n Detection","summary":" Although facial landmark detection (FLD) has gained significant progress,\nexisting FLD methods still suffer from performance drops on partially\nnon-visible faces, such as faces with occlusions or under extreme lighting\nconditions or poses. To address this issue, we introduce ORFormer, a novel\ntransformer-based method that can detect non-visible regions and recover their\nmissing features from visible parts. Specifically, ORFormer associates each\nimage patch token with one additional learnable token called the messenger\ntoken. The messenger token aggregates features from all but its patch. This\nway, the consensus between a patch and other patches can be assessed by\nreferring to the similarity between its regular and messenger embeddings,\nenabling non-visible region identification. Our method then recovers occluded\npatches with features aggregated by the messenger tokens. Leveraging the\nrecovered features, ORFormer compiles high-quality heatmaps for the downstream\nFLD task. Extensive experiments show that our method generates heatmaps\nresilient to partial occlusions. By integrating the resultant heatmaps into\nexisting FLD methods, our method performs favorably against the state of the\narts on challenging datasets such as WFLW and COFW.\n","authors":["Jui-Che Chiang","Hou-Ning Hu","Bo-Syuan Hou","Chia-Yu Tseng","Yu-Lun Liu","Min-Hung Chen","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2412.13174v2.pdf","comment":"WACV 2025 Project Link: https://ben0919.github.io/ORFormer/"},{"id":"http://arxiv.org/abs/2501.08163v1","updated":"2025-01-14T14:41:51Z","published":"2025-01-14T14:41:51Z","title":"DM-Mamba: Dual-domain Multi-scale Mamba for MRI reconstruction","summary":" The accelerated MRI reconstruction poses a challenging ill-posed inverse\nproblem due to the significant undersampling in k-space. Deep neural networks,\nsuch as CNNs and ViT, have shown substantial performance improvements for this\ntask while encountering the dilemma between global receptive fields and\nefficient computation. To this end, this paper pioneers exploring Mamba, a new\nparadigm for long-range dependency modeling with linear complexity, for\nefficient and effective MRI reconstruction. However, directly applying Mamba to\nMRI reconstruction faces three significant issues: (1) Mamba's row-wise and\ncolumn-wise scanning disrupts k-space's unique spectrum, leaving its potential\nin k-space learning unexplored. (2) Existing Mamba methods unfold feature maps\nwith multiple lengthy scanning paths, leading to long-range forgetting and high\ncomputational burden. (3) Mamba struggles with spatially-varying contents,\nresulting in limited diversity of local representations. To address these, we\npropose a dual-domain multi-scale Mamba for MRI reconstruction from the\nfollowing perspectives: (1) We pioneer vision Mamba in k-space learning. A\ncircular scanning is customized for spectrum unfolding, benefiting the global\nmodeling of k-space. (2) We propose a multi-scale Mamba with an efficient\nscanning strategy in both image and k-space domains. 
It mitigates long-range\nforgetting and achieves a better trade-off between efficiency and performance.\n(3) We develop a local diversity enhancement module to improve the\nspatially-varying representation of Mamba. Extensive experiments are conducted\non three public datasets for MRI reconstruction under various undersampling\npatterns. Comprehensive results demonstrate that our method significantly\noutperforms state-of-the-art methods with lower computational cost.\nImplementation code will be available at\nhttps://github.com/XiaoMengLiLiLi/DM-Mamba.\n","authors":["Yucong Meng","Zhiwei Yang","Zhijian Song","Yonghong Shi"],"pdf_url":"https://arxiv.org/pdf/2501.08163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06746v2","updated":"2025-01-14T14:40:35Z","published":"2025-01-12T08:04:52Z","title":"Diversified Augmentation with Domain Adaptation for Debiased Video\n Temporal Grounding","summary":" Temporal sentence grounding in videos (TSGV) faces challenges due to public\nTSGV datasets containing significant temporal biases, which are attributed to\nthe uneven temporal distributions of target moments. Existing methods generate\naugmented videos, where target moments are forced to have varying temporal\nlocations. However, since the video lengths of the given datasets have small\nvariations, only changing the temporal locations results in poor generalization\nability in videos with varying lengths. In this paper, we propose a novel\ntraining framework complemented by diversified data augmentation and a domain\ndiscriminator. The data augmentation generates videos with various lengths and\ntarget moment locations to diversify temporal distributions. However, augmented\nvideos inevitably exhibit distinct feature distributions which may introduce\nnoise. To address this, we design a domain adaptation auxiliary task to\ndiminish feature discrepancies between original and augmented videos. We also\nencourage the model to produce distinct predictions for videos with the same\ntext queries but different moment locations to promote debiased training.\nExperiments on Charades-CD and ActivityNet-CD datasets demonstrate the\neffectiveness and generalization abilities of our method in multiple grounding\nstructures, achieving state-of-the-art results.\n","authors":["Junlong Ren","Gangjian Zhang","Haifeng Sun","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2501.06746v2.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.06040v2","updated":"2025-01-14T14:33:55Z","published":"2025-01-10T15:18:05Z","title":"MSCViT: A Small-size ViT architecture with Multi-Scale Self-Attention\n Mechanism for Tiny Datasets","summary":" Vision Transformer (ViT) has demonstrated significant potential in various\nvision tasks due to its strong ability in modelling long-range dependencies.\nHowever, such success is largely fueled by training on massive samples. In real\napplications, the large-scale datasets are not always available, and ViT\nperforms worse than Convolutional Neural Networks (CNNs) if it is only trained\non small scale dataset (called tiny dataset), since it requires large amount of\ntraining data to ensure its representational capacity. In this paper, a\nsmall-size ViT architecture with multi-scale self-attention mechanism and\nconvolution blocks is presented (dubbed MSCViT) to model different scales of\nattention at each layer. 
Firstly, we introduced wavelet convolution, which\nselectively combines the high-frequency components obtained by frequency\ndivision with our convolution channel to extract local features. Then, a\nlightweight multi-head attention module is developed to reduce the number of\ntokens and computational costs. Finally, the positional encoding (PE) in the\nbackbone is replaced by a local feature extraction module. Compared with the\noriginal ViT, it is parameter-efficient and is particularly suitable for tiny\ndatasets. Extensive experiments have been conducted on tiny datasets, in which\nour model achieves an accuracy of 84.68% on CIFAR-100 with 14.0M parameters and\n2.5 GFLOPs, without pre-training on large datasets.\n","authors":["Bowei Zhang","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.06040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08152v1","updated":"2025-01-14T14:26:18Z","published":"2025-01-14T14:26:18Z","title":"Energy Backdoor Attack to Deep Neural Networks","summary":" The rise of deep learning (DL) has increased computing complexity and energy\nuse, prompting the adoption of application specific integrated circuits (ASICs)\nfor energy-efficient edge and mobile deployment. However, recent studies have\ndemonstrated the vulnerability of these accelerators to energy attacks. Despite\nthe development of various inference time energy attacks in prior research,\nbackdoor energy attacks remain unexplored. In this paper, we design an\ninnovative energy backdoor attack against deep neural networks (DNNs) operating\non sparsity-based accelerators. Our attack is carried out in two distinct\nphases: backdoor injection and backdoor stealthiness. Experimental results\nusing ResNet-18 and MobileNet-V2 models trained on CIFAR-10 and Tiny ImageNet\ndatasets show the effectiveness of our proposed attack in increasing energy\nconsumption on trigger samples while preserving the model's performance for\nclean/regular inputs. This demonstrates the vulnerability of DNNs to energy\nbackdoor attacks. The source code of our attack is available at:\nhttps://github.com/hbrachemi/energy_backdoor.\n","authors":["Hanene F. Z. Brachemi Meftah","Wassim Hamidouche","Sid Ahmed Fezza","Olivier Déforges","Kassem Kallas"],"pdf_url":"https://arxiv.org/pdf/2501.08152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09655v2","updated":"2025-01-14T14:22:05Z","published":"2022-10-18T07:48:59Z","title":"WINE: Wavelet-Guided GAN Inversion and Editing for High-Fidelity\n Refinement","summary":" Recent advanced GAN inversion models aim to convey high-fidelity information\nfrom original images to generators through methods using generator tuning or\nhigh-dimensional feature learning. Despite these efforts, accurately\nreconstructing image-specific details remains as a challenge due to the\ninherent limitations both in terms of training and structural aspects, leading\nto a bias towards low-frequency information. In this paper, we look into the\nwidely used pixel loss in GAN inversion, revealing its predominant focus on the\nreconstruction of low-frequency features. We then propose WINE, a\nWavelet-guided GAN Inversion aNd Editing model, which transfers the\nhigh-frequency information through wavelet coefficients via newly proposed\nwavelet loss and wavelet fusion scheme. Notably, WINE is the first attempt to\ninterpret GAN inversion in the frequency domain. Our experimental results\nshowcase the precision of WINE in preserving high-frequency details and\nenhancing image quality. 
Even in editing scenarios, WINE outperforms existing\nstate-of-the-art GAN inversion models with a fine balance between editability\nand reconstruction quality.\n","authors":["Chaewon Kim","Seung-Jun Moon","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2210.09655v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08142v1","updated":"2025-01-14T14:21:48Z","published":"2025-01-14T14:21:48Z","title":"Bootstrapping Corner Cases: High-Resolution Inpainting for Safety\n Critical Detect and Avoid for Automated Flying","summary":" Modern machine learning techniques have shown tremendous potential,\nespecially for object detection on camera images. For this reason, they are\nalso used to enable safety-critical automated processes such as autonomous\ndrone flights. We present a study on object detection for Detect and Avoid, a\nsafety critical function for drones that detects air traffic during automated\nflights for safety reasons. An ill-posed problem is the generation of good and\nespecially large data sets, since detection itself is the corner case. Most\nmodels suffer from limited ground truth in raw data, \\eg recorded air traffic\nor frontal flight with a small aircraft. It often leads to poor and critical\ndetection rates. We overcome this problem by using inpainting methods to\nbootstrap the dataset such that it explicitly contains the corner cases of the\nraw data. We provide an overview of inpainting methods and generative models\nand present an example pipeline given a small annotated dataset. We validate\nour method by generating a high-resolution dataset, which we make publicly\navailable and present it to an independent object detector that was fully\ntrained on real data.\n","authors":["Jonathan Lyhs","Lars Hinneburg","Michael Fischer","Florian Ölsner","Stefan Milz","Jeremy Tschirner","Patrick Mäder"],"pdf_url":"https://arxiv.org/pdf/2501.08142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08137v1","updated":"2025-01-14T14:15:10Z","published":"2025-01-14T14:15:10Z","title":"Audio-visual Deepfake Detection With Local Temporal Inconsistencies","summary":" This paper proposes an audio-visual deepfake detection approach that aims to\ncapture fine-grained temporal inconsistencies between audio and visual\nmodalities. To achieve this, both architectural and data synthesis strategies\nare introduced. From an architectural perspective, a temporal distance map,\ncoupled with an attention mechanism, is designed to capture these\ninconsistencies while minimizing the impact of irrelevant temporal\nsubsequences. Moreover, we explore novel pseudo-fake generation techniques to\nsynthesize local inconsistencies. Our approach is evaluated against\nstate-of-the-art methods using the DFDC and FakeAVCeleb datasets, demonstrating\nits effectiveness in detecting audio-visual deepfakes.\n","authors":["Marcella Astrid","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2501.08137v1.pdf","comment":"Accepted in ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.06838v2","updated":"2025-01-14T14:09:23Z","published":"2025-01-12T15:14:58Z","title":"Generalized and Efficient 2D Gaussian Splatting for Arbitrary-scale\n Super-Resolution","summary":" Equipped with the continuous representation capability of Multi-Layer\nPerceptron (MLP), Implicit Neural Representation (INR) has been successfully\nemployed for Arbitrary-scale Super-Resolution (ASR). 
However, the limited\nreceptive field of the linear layers in MLP restricts the representation\ncapability of INR, while it is computationally expensive to query the MLP\nnumerous times to render each pixel. Recently, Gaussian Splatting (GS) has\nshown its advantages over INR in both visual quality and rendering speed in 3D\ntasks, which motivates us to explore whether GS can be employed for the ASR\ntask. However, directly applying GS to ASR is exceptionally challenging because\nthe original GS is an optimization-based method through overfitting each single\nscene, while in ASR we aim to learn a single model that can generalize to\ndifferent images and scaling factors. We overcome these challenges by\ndeveloping two novel techniques. Firstly, to generalize GS for ASR, we\nelaborately design an architecture to predict the corresponding\nimage-conditioned Gaussians of the input low-resolution image in a feed-forward\nmanner. Secondly, we implement an efficient differentiable 2D GPU/CUDA-based\nscale-aware rasterization to render super-resolved images by sampling discrete\nRGB values from the predicted contiguous Gaussians. Via end-to-end training,\nour optimized network, namely GSASR, can perform ASR for any image and unseen\nscaling factors. Extensive experiments validate the effectiveness of our\nproposed method. The project page can be found at\n\\url{https://mt-cly.github.io/GSASR.github.io/}.\n","authors":["Du Chen","Liyi Chen","Zhengqiang Zhang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.06838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08131v1","updated":"2025-01-14T14:07:48Z","published":"2025-01-14T14:07:48Z","title":"SAR Strikes Back: A New Hope for RSVQA","summary":" Remote sensing visual question answering (RSVQA) is a task that automatically\nextracts information from satellite images and processes a question to predict\nthe answer from the images in textual form, helping with the interpretation of\nthe image. While different methods have been proposed to extract information\nfrom optical images with different spectral bands and resolutions, no method\nhas been proposed to answer questions from Synthetic Aperture Radar (SAR)\nimages. SAR images capture electromagnetic information from the scene, and are\nless affected by atmospheric conditions, such as clouds. In this work, our\nobjective is to introduce SAR in the RSVQA task, finding the best way to use\nthis modality. In our research, we carry out a study on different pipelines for\nthe task of RSVQA taking into account information from both SAR and optical\ndata. To this purpose, we also present a dataset that allows for the\nintroduction of SAR images in the RSVQA framework. We propose two different\nmodels to include the SAR modality. The first one is an end-to-end method in\nwhich we add an additional encoder for the SAR modality. In the second\napproach, we build on a two-stage framework. First, relevant information is\nextracted from SAR and, optionally, optical data. This information is then\ntranslated into natural language to be used in the second step which only\nrelies on a language model to provide the answer. We find that the second\npipeline allows us to obtain good results with SAR images alone. We then try\nvarious types of fusion methods to use SAR and optical images together, finding\nthat a fusion at the decision level achieves the best results on the proposed\ndataset. 
We show that SAR data offers additional information when fused with\nthe optical modality, particularly for questions related to specific land cover\nclasses, such as water areas.\n","authors":["Lucrezia Tosato","Flora Weissgerber","Laurent Wendling","Sylvain Lobry"],"pdf_url":"https://arxiv.org/pdf/2501.08131v1.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.08118v1","updated":"2025-01-14T13:51:14Z","published":"2025-01-14T13:51:14Z","title":"Revisiting Birds Eye View Perception Models with Frozen Foundation\n Models: DINOv2 and Metric3Dv2","summary":" Birds Eye View perception models require extensive data to perform and\ngeneralize effectively. While traditional datasets often provide abundant\ndriving scenes from diverse locations, this is not always the case. It is\ncrucial to maximize the utility of the available training data. With the advent\nof large foundation models such as DINOv2 and Metric3Dv2, a pertinent question\narises: can these models be integrated into existing model architectures to not\nonly reduce the required training data but surpass the performance of current\nmodels? We choose two model architectures in the vehicle segmentation domain to\nalter: Lift-Splat-Shoot, and Simple-BEV. For Lift-Splat-Shoot, we explore the\nimplementation of frozen DINOv2 for feature extraction and Metric3Dv2 for depth\nestimation, where we greatly exceed the baseline results by 7.4 IoU while\nutilizing only half the training data and iterations. Furthermore, we introduce\nan innovative application of Metric3Dv2's depth information as a PseudoLiDAR\npoint cloud incorporated into the Simple-BEV architecture, replacing\ntraditional LiDAR. This integration results in a +3 IoU improvement compared to\nthe Camera-only model.\n","authors":["Seamie Hayes","Ganesh Sistu","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2501.08118v1.pdf","comment":"Accepted for publication at the Electronic Imaging - Autonomous\n Vehicles and Machines Conference 2025"},{"id":"http://arxiv.org/abs/2501.08115v1","updated":"2025-01-14T13:46:07Z","published":"2025-01-14T13:46:07Z","title":"RoHan: Robust Hand Detection in Operation Room","summary":" Hand-specific localization has garnered significant interest within the\ncomputer vision community. Although there are numerous datasets with hand\nannotations from various angles and settings, domain transfer techniques\nfrequently struggle in surgical environments. This is mainly due to the limited\navailability of gloved hand instances and the unique challenges of operating\nrooms (ORs). Thus, hand-detection models tailored to OR settings require\nextensive training and expensive annotation processes. To overcome these\nchallenges, we present \"RoHan\" - a novel approach for robust hand detection in\nthe OR, leveraging advanced semi-supervised domain adaptation techniques to\ntackle the challenges of varying recording conditions, diverse glove colors,\nand occlusions common in surgical settings. Our methodology encompasses two\nmain stages: (1) data augmentation strategy that utilizes \"Artificial Gloves,\"\na method for augmenting publicly available hand datasets with synthetic images\nof hands wearing gloves; (2) semi-supervised domain adaptation pipeline that\nimproves detection performance in real-world OR settings through iterative\nprediction refinement and efficient frame filtering. We evaluate our method\nusing two datasets: simulated enterotomy repair and saphenous vein graft\nharvesting. 
\"RoHan\" substantially reduces the need for extensive labeling and\nmodel training, paving the way for the practical implementation of hand\ndetection technologies in medical settings.\n","authors":["Roi Papo","Sapir Gershov","Tom Friedman","Itay Or","Gil Bolotin","Shlomi Laufer"],"pdf_url":"https://arxiv.org/pdf/2501.08115v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2501.08114v1","updated":"2025-01-14T13:46:03Z","published":"2025-01-14T13:46:03Z","title":"Change Captioning in Remote Sensing: Evolution to SAT-Cap -- A\n Single-Stage Transformer Approach","summary":" Change captioning has become essential for accurately describing changes in\nmulti-temporal remote sensing data, providing an intuitive way to monitor\nEarth's dynamics through natural language. However, existing change captioning\nmethods face two key challenges: high computational demands due to multistage\nfusion strategy, and insufficient detail in object descriptions due to limited\nsemantic extraction from individual images. To solve these challenges, we\npropose SAT-Cap based on the transformers model with a single-stage feature\nfusion for remote sensing change captioning. In particular, SAT-Cap integrates\na Spatial-Channel Attention Encoder, a Difference-Guided Fusion module, and a\nCaption Decoder. Compared to typical models that require multi-stage fusion in\ntransformer encoder and fusion module, SAT-Cap uses only a simple cosine\nsimilarity-based fusion module for information integration, reducing the\ncomplexity of the model architecture. By jointly modeling spatial and channel\ninformation in Spatial-Channel Attention Encoder, our approach significantly\nenhances the model's ability to extract semantic information from objects in\nmulti-temporal remote sensing images. Extensive experiments validate the\neffectiveness of SAT-Cap, achieving CIDEr scores of 140.23% on the LEVIR-CC\ndataset and 97.74% on the DUBAI-CC dataset, surpassing current state-of-the-art\nmethods. The code and pre-trained models will be available online.\n","authors":["Yuduo Wang","Weikang Yu","Pedram Ghamisi"],"pdf_url":"https://arxiv.org/pdf/2501.08114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08111v1","updated":"2025-01-14T13:42:22Z","published":"2025-01-14T13:42:22Z","title":"EarthView: A Large Scale Remote Sensing Dataset for Self-Supervision","summary":" This paper presents EarthView, a comprehensive dataset specifically designed\nfor self-supervision on remote sensing data, intended to enhance deep learning\napplications on Earth monitoring tasks. The dataset spans 15 tera pixels of\nglobal remote-sensing data, combining imagery from a diverse range of sources,\nincluding NEON, Sentinel, and a novel release of 1m spatial resolution data\nfrom Satellogic. Our dataset provides a wide spectrum of image data with\nvarying resolutions, harnessed from different sensors and organized coherently\ninto an accessible HuggingFace dataset in parquet format. This data spans five\nyears, from 2017 to 2022. Accompanying the dataset, we introduce EarthMAE, a\ntailored Masked Autoencoder, developed to tackle the distinct challenges of\nremote sensing data. Trained in a self-supervised fashion, EarthMAE effectively\nprocesses different data modalities such as hyperspectral, multispectral,\ntopographical data, segmentation maps, and temporal structure. This model helps\nus show that pre-training on Satellogic data improves performance on downstream\ntasks. 
While there is still a gap to fill in MAE for heterogeneous data, we\nregard this innovative combination of an expansive, diverse dataset and a\nversatile model adapted for self-supervised learning as a stride forward in\ndeep learning for Earth monitoring.\n","authors":["Diego Velazquez","Pau Rodriguez López","Sergio Alonso","Josep M. Gonfaus","Jordi Gonzalez","Gerardo Richarte","Javier Marin","Yoshua Bengio","Alexandre Lacoste"],"pdf_url":"https://arxiv.org/pdf/2501.08111v1.pdf","comment":"2nd Workshop on Computer Vision for Earth Observation (CV4EO)\n Applications"},{"id":"http://arxiv.org/abs/2312.16409v2","updated":"2025-01-14T13:14:00Z","published":"2023-12-27T04:40:12Z","title":"Dynamic Sub-graph Distillation for Robust Semi-supervised Continual\n Learning","summary":" Continual learning (CL) has shown promising results and comparable\nperformance to learning at once in a fully supervised manner. However, CL\nstrategies typically require a large number of labeled samples, making their\nreal-life deployment challenging. In this work, we focus on semi-supervised\ncontinual learning (SSCL), where the model progressively learns from partially\nlabeled data with unknown categories. We provide a comprehensive analysis of\nSSCL and demonstrate that unreliable distributions of unlabeled data lead to\nunstable training and refinement of the progressing stages. This problem\nseverely impacts the performance of SSCL. To address the limitations, we\npropose a novel approach called Dynamic Sub-Graph Distillation (DSGD) for\nsemi-supervised continual learning, which leverages both semantic and\nstructural information to achieve more stable knowledge distillation on\nunlabeled data and exhibit robustness against distribution bias. Firstly, we\nformalize a general model of structural distillation and design a dynamic graph\nconstruction for the continual learning progress. Next, we define a structure\ndistillation vector and design a dynamic sub-graph distillation algorithm,\nwhich enables end-to-end training and adaptability to scale up tasks. The\nentire proposed method is adaptable to various CL methods and supervision\nsettings. Finally, experiments conducted on three datasets CIFAR10, CIFAR100,\nand ImageNet-100, with varying supervision ratios, demonstrate the\neffectiveness of our proposed approach in mitigating the catastrophic\nforgetting problem in semi-supervised continual learning scenarios.\n","authors":["Yan Fan","Yu Wang","Pengfei Zhu","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2312.16409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08097v1","updated":"2025-01-14T13:10:29Z","published":"2025-01-14T13:10:29Z","title":"Guiding the classification of hepatocellular carcinoma on 3D CT-scans\n using deep and handcrafted radiological features","summary":" Hepatocellular carcinoma is the most spread primary liver cancer across the\nworld ($\\sim$80\\% of the liver tumors). The gold standard for HCC diagnosis is\nliver biopsy. However, in the clinical routine, expert radiologists provide a\nvisual diagnosis by interpreting hepatic CT-scans according to a standardized\nprotocol, the LI-RADS, which uses five radiological criteria with an associated\ndecision tree. In this paper, we propose an automatic approach to predict\nhistology-proven HCC from CT images in order to reduce radiologists'\ninter-variability. 
We first show that standard deep learning methods fail to\naccurately predict HCC from CT-scans on a challenging database, and propose a\ntwo-step approach inspired by the LI-RADS system to improve the performance. We\nachieve improvements from 6 to 18 points of AUC with respect to deep learning\nbaselines trained with different architectures. We also provide clinical\nvalidation of our method, achieving results that outperform non-expert\nradiologists and are on par with expert ones.\n","authors":["E. Sarfati","A. Bône","M-M. Rohé","C. Aubé","M. Ronot","P. Gori","I. Bloch"],"pdf_url":"https://arxiv.org/pdf/2501.08097v1.pdf","comment":"IEEE ISBI 2025"},{"id":"http://arxiv.org/abs/2501.08094v1","updated":"2025-01-14T13:09:36Z","published":"2025-01-14T13:09:36Z","title":"CellOMaps: A Compact Representation for Robust Classification of Lung\n Adenocarcinoma Growth Patterns","summary":" Lung adenocarcinoma (LUAD) is a morphologically heterogeneous disease,\ncharacterized by five primary histological growth patterns. The classification\nof such patterns is crucial due to their direct relation to prognosis but the\nhigh subjectivity and observer variability pose a major challenge. Although\nseveral studies have developed machine learning methods for growth pattern\nclassification, they either only report the predominant pattern per slide or\nlack proper evaluation. We propose a generalizable machine learning pipeline\ncapable of classifying lung tissue into one of the five patterns or as\nnon-tumor. The proposed pipeline's strength lies in a novel compact Cell\nOrganization Maps (cellOMaps) representation that captures the cellular spatial\npatterns from Hematoxylin and Eosin whole slide images (WSIs). The proposed\npipeline provides state-of-the-art performance on LUAD growth pattern\nclassification when evaluated on both internal unseen slides and external\ndatasets, significantly outperforming the current approaches. In addition, our\npreliminary results show that the model's outputs can be used to predict\npatients Tumor Mutational Burden (TMB) levels.\n","authors":["Arwa Al-Rubaian","Gozde N. Gunesli","Wajd A. Althakfi","Ayesha Azam","David Snead","Nasir M. Rajpoot","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2501.08094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08088v1","updated":"2025-01-14T12:57:17Z","published":"2025-01-14T12:57:17Z","title":"AgentPose: Progressive Distribution Alignment via Feature Agent for\n Human Pose Distillation","summary":" Pose distillation is widely adopted to reduce model size in human pose\nestimation. However, existing methods primarily emphasize the transfer of\nteacher knowledge while often neglecting the performance degradation resulted\nfrom the curse of capacity gap between teacher and student. To address this\nissue, we propose AgentPose, a novel pose distillation method that integrates a\nfeature agent to model the distribution of teacher features and progressively\naligns the distribution of student features with that of the teacher feature,\neffectively overcoming the capacity gap and enhancing the ability of knowledge\ntransfer. 
Our comprehensive experiments conducted on the COCO dataset\nsubstantiate the effectiveness of our method in knowledge transfer,\nparticularly in scenarios with a high capacity gap.\n","authors":["Feng Zhang","Jinwei Liu","Xiatian Zhu","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2501.08088v1.pdf","comment":"5 pages, 1 figures"},{"id":"http://arxiv.org/abs/2302.08878v3","updated":"2025-01-14T12:53:24Z","published":"2023-02-17T13:50:53Z","title":"Less is More: The Influence of Pruning on the Explainability of CNNs","summary":" Over the last century, deep learning models have become the state-of-the-art\nfor solving complex computer vision problems. These modern computer vision\nmodels have millions of parameters, which presents two major challenges: (1)\nthe increased computational requirements hamper the deployment in\nresource-constrained environments, such as mobile or IoT devices, and (2)\nexplaining the complex decisions of such networks to humans is challenging.\nNetwork pruning is a technical approach to reduce the complexity of models,\nwhere less important parameters are removed. The work presented in this paper\ninvestigates whether this reduction in technical complexity also helps with\nperceived explainability. To do so, we conducted a pre-study and two\nhuman-grounded experiments, assessing the effects of different pruning ratios\non explainability. Overall, we evaluate four different compression rates (i.e.,\n2, 4, 8, and 32) with 37 500 tasks on Mechanical Turk. Results indicate that\nlower compression rates have a positive influence on explainability, while\nhigher compression rates show negative effects. Furthermore, we were able to\nidentify sweet spots that increase both the perceived explainability and the\nmodel's performance.\n","authors":["Florian Merkle","David Weber","Pascal Schöttle","Stephan Schlögl","Martin Nocker"],"pdf_url":"https://arxiv.org/pdf/2302.08878v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08083v1","updated":"2025-01-14T12:51:34Z","published":"2025-01-14T12:51:34Z","title":"Benchmarking Vision Foundation Models for Input Monitoring in Autonomous\n Driving","summary":" Deep neural networks (DNNs) remain challenged by distribution shifts in\ncomplex open-world domains like automated driving (AD): Absolute robustness\nagainst yet unknown novel objects (semantic shift) or styles like lighting\nconditions (covariate shift) cannot be guaranteed. Hence, reliable\noperation-time monitors for identification of out-of-training-data-distribution\n(OOD) scenarios are imperative. Current approaches for OOD classification are\nuntested for complex domains like AD, are limited in the kinds of shifts they\ndetect, or even require supervision with OOD samples. To prepare for\nunanticipated shifts, we instead establish a framework around a principled,\nunsupervised, and model-agnostic method that unifies detection of all kinds of\nshifts: Find a full model of the training data's feature distribution, to then\nuse its density at new points as in-distribution (ID) score. To implement this,\nwe propose to combine the newly available Vision Foundation Models (VFM) as\nfeature extractors with one of four alternative density modeling techniques. 
In\nan extensive benchmark of 4 VFMs against 20 baselines, we show the superior\nperformance of VFM feature encodings compared to shift-specific OOD monitors.\nAdditionally, we find that sophisticated architectures outperform larger latent\nspace dimensionality; and our method identifies samples with higher risk of\nerrors on downstream tasks, despite being model-agnostic. This suggests that\nVFMs are promising to realize model-agnostic, unsupervised, reliable safety\nmonitors in complex vision tasks.\n","authors":["Nert Keser","Halil Ibrahim Orhan","Niki Amini-Naieni","Gesina Schwalbe","Alois Knoll","Matthias Rottmann"],"pdf_url":"https://arxiv.org/pdf/2501.08083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00376v3","updated":"2025-01-14T12:37:26Z","published":"2024-03-01T09:01:53Z","title":"Spurious Feature Eraser: Stabilizing Test-Time Adaptation for\n Vision-Language Foundation Model","summary":" Vision-language foundation models have exhibited remarkable success across a\nmultitude of downstream tasks due to their scalability on extensive image-text\npaired data. However, these models also display significant limitations when\napplied to downstream tasks, such as fine-grained image classification, as a\nresult of ``decision shortcuts'' that hinder their generalization capabilities.\nIn this work, we find that the CLIP model possesses a rich set of features,\nencompassing both \\textit{desired invariant causal features} and\n\\textit{undesired decision shortcuts}. Moreover, the underperformance of CLIP\non downstream tasks originates from its inability to effectively utilize\npre-trained features in accordance with specific task requirements. To address\nthis challenge, we propose a simple yet effective method, Spurious Feature\nEraser (SEraser), to alleviate the decision shortcuts by erasing the spurious\nfeatures. Specifically, we introduce a test-time prompt tuning paradigm that\noptimizes a learnable prompt, thereby compelling the model to exploit invariant\nfeatures while disregarding decision shortcuts during the inference phase. The\nproposed method effectively alleviates excessive dependence on potentially\nmisleading spurious information. We conduct comparative analysis of the\nproposed method against various approaches which validates the significant\nsuperiority.\n","authors":["Huan Ma","Yan Zhu","Changqing Zhang","Peilin Zhao","Baoyuan Wu","Long-Kai Huang","Qinghua Hu","Bingzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2403.00376v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20020v3","updated":"2025-01-14T12:31:48Z","published":"2024-07-29T13:57:24Z","title":"ImagiNet: A Multi-Content Benchmark for Synthetic Image Detection","summary":" Recent generative models produce images with a level of authenticity that\nmakes them nearly indistinguishable from real photos and artwork. Potential\nharmful use cases of these models, necessitate the creation of robust synthetic\nimage detectors. However, current datasets in the field contain generated\nimages with questionable quality or have examples from one predominant content\ntype which leads to poor generalizability of the underlying detectors. We find\nthat the curation of a balanced amount of high-resolution generated images\nacross various content types is crucial for the generalizability of detectors,\nand introduce ImagiNet, a dataset of 200K examples, spanning four categories:\nphotos, paintings, faces, and miscellaneous. 
Synthetic images in ImagiNet are\nproduced with both open-source and proprietary generators, whereas real\ncounterparts for each content type are collected from public datasets. The\nstructure of ImagiNet allows for a two-track evaluation system: i)\nclassification as real or synthetic and ii) identification of the generative\nmodel. To establish a strong baseline, we train a ResNet-50 model using a\nself-supervised contrastive objective (SelfCon) for each track which achieves\nevaluation AUC of up to 0.99 and balanced accuracy ranging from 86% to 95%,\neven under conditions that involve compression and resizing. The provided model\nis generalizable enough to achieve zero-shot state-of-the-art performance on\nprevious synthetic detection benchmarks. We provide ablations to demonstrate\nthe importance of content types and publish code and data.\n","authors":["Delyan Boychev","Radostin Cholakov"],"pdf_url":"https://arxiv.org/pdf/2407.20020v3.pdf","comment":"Workshop on Datasets and Evaluators of AI Safety, AAAI 2025"},{"id":"http://arxiv.org/abs/2411.02188v4","updated":"2025-01-14T12:27:32Z","published":"2024-11-04T15:42:22Z","title":"Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition\n via Foundation Models","summary":" The accuracy of face recognition systems has improved significantly in the\npast few years, thanks to the large amount of data collected and advancements\nin neural network architectures. However, these large-scale datasets are often\ncollected without explicit consent, raising ethical and privacy concerns. To\naddress this, there have been proposals to use synthetic datasets for training\nface recognition models. Yet, such models still rely on real data to train the\ngenerative models and generally exhibit inferior performance compared to those\ntrained on real datasets. One of these datasets, DigiFace, uses a graphics\npipeline to generate different identities and intra-class variations without\nusing real data in model training. However, the performance of this approach is\npoor on face recognition benchmarks, possibly due to the lack of realism in the\nimages generated by the graphics pipeline. In this work, we introduce a novel\nframework for realism transfer aimed at enhancing the realism of synthetically\ngenerated face images. Our method leverages the large-scale face foundation\nmodel, and we adapt the pipeline for realism enhancement. By integrating the\ncontrollable aspects of the graphics pipeline with our realism enhancement\ntechnique, we generate a large amount of realistic variations, combining the\nadvantages of both approaches. Our empirical evaluations demonstrate that\nmodels trained using our enhanced dataset significantly improve the performance\nof face recognition systems over the baseline. The source code and dataset will\nbe publicly accessible at the following link:\nhttps://www.idiap.ch/paper/digi2real\n","authors":["Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.02188v4.pdf","comment":"The dataset would be available here:\n https://www.idiap.ch/paper/digi2real Accepted for Publication in WACV 2025"},{"id":"http://arxiv.org/abs/2501.08062v1","updated":"2025-01-14T12:15:49Z","published":"2025-01-14T12:15:49Z","title":"Skeleton and Font Generation Network for Zero-shot Chinese Character\n Generation","summary":" Automatic font generation remains a challenging research issue, primarily due\nto the vast number of Chinese characters, each with unique and intricate\nstructures. 
Our investigation of previous studies reveals inherent bias capable\nof causing structural changes in characters. Specifically, when generating a\nChinese character similar to, but different from, those in the training\nsamples, the bias is prone to either correcting or ignoring these subtle\nvariations. To address this concern, we propose a novel Skeleton and Font\nGeneration Network (SFGN) to achieve a more robust Chinese character font\ngeneration. Our approach includes a skeleton builder and font generator. The\nskeleton builder synthesizes content features using low-resource text input,\nenabling our technique to realize font generation independently of content\nimage inputs. Unlike previous font generation methods that treat font style as\na global embedding, we introduce a font generator to align content and style\nfeatures on the radical level, which is a brand-new perspective for font\ngeneration. In addition to common characters, we also conduct experiments on\nmisspelled characters, a substantial portion of which slightly differs from the\ncommon ones. Our approach visually demonstrates the efficacy of generated\nimages and outperforms current state-of-the-art font generation methods.\nMoreover, we believe that misspelled character generation has significant\npedagogical implications and verify this supposition through experiments. We\nused generated misspelled characters as data augmentation in Chinese character\nerror correction tasks, simulating the scenario where students learn\nhandwritten Chinese characters with the help of misspelled characters. The\nsignificantly improved performance of error correction tasks demonstrates the\neffectiveness of our proposed approach and the value of misspelled character\ngeneration.\n","authors":["Mobai Xue","Jun Du","Zhenrong Zhang","Jiefeng Ma","Qikai Chang","Pengfei Hu","Jianshu Zhang","Yu Hu"],"pdf_url":"https://arxiv.org/pdf/2501.08062v1.pdf","comment":"36 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.09471v5","updated":"2025-01-14T11:59:06Z","published":"2024-03-14T15:10:54Z","title":"MambaTalk: Efficient Holistic Gesture Synthesis with Selective State\n Space Models","summary":" Gesture synthesis is a vital realm of human-computer interaction, with\nwide-ranging applications across various fields like film, robotics, and\nvirtual reality. Recent advancements have utilized the diffusion model and\nattention mechanisms to improve gesture synthesis. However, due to the high\ncomputational complexity of these techniques, generating long and diverse\nsequences with low latency remains a challenge. We explore the potential of\nstate space models (SSMs) to address the challenge, implementing a two-stage\nmodeling strategy with discrete motion priors to enhance the quality of\ngestures. 
Leveraging the foundational Mamba block, we introduce MambaTalk,\nenhancing gesture diversity and rhythm through multimodal integration.\nExtensive experiments demonstrate that our method matches or exceeds the\nperformance of state-of-the-art models.\n","authors":["Zunnan Xu","Yukang Lin","Haonan Han","Sicheng Yang","Ronghui Li","Yachao Zhang","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2403.09471v5.pdf","comment":"NeurIPS 2024, Camera Ready"},{"id":"http://arxiv.org/abs/2410.03335v2","updated":"2025-01-14T11:59:03Z","published":"2024-10-04T11:40:53Z","title":"Audio-Agent: Leveraging LLMs For Audio Generation, Editing and\n Composition","summary":" We introduce Audio-Agent, a multimodal framework for audio generation,\nediting and composition based on text or video inputs. Conventional approaches\nfor text-to-audio (TTA) tasks often make single-pass inferences from text\ndescriptions. While straightforward, this design struggles to produce\nhigh-quality audio when given complex text conditions. In our method, we\nutilize a pre-trained TTA diffusion network as the audio generation agent to\nwork in tandem with GPT-4, which decomposes the text condition into atomic,\nspecific instructions and calls the agent for audio generation. In doing so,\nAudio-Agent can generate high-quality audio that is closely aligned with the\nprovided text or video exhibiting complex and multiple events, while supporting\nvariable-length and variable-volume generation. For video-to-audio (VTA) tasks,\nmost existing methods require training a timestamp detector to synchronize\nvideo events with the generated audio, a process that can be tedious and\ntime-consuming. Instead, we propose a simpler approach by fine-tuning a\npre-trained Large Language Model (LLM), e.g., Gemma2-2B-it, to obtain both\nsemantic and temporal conditions that bridge the video and audio modality.\nConsequently, our framework contributes a comprehensive solution for both TTA\nand VTA tasks without substantial computational overhead in training.\n","authors":["Zixuan Wang","Chi-Keung Tang","Yu-Wing Tai"],"pdf_url":"https://arxiv.org/pdf/2410.03335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08049v1","updated":"2025-01-14T11:56:00Z","published":"2025-01-14T11:56:00Z","title":"Self-Attentive Spatio-Temporal Calibration for Precise Intermediate\n Layer Matching in ANN-to-SNN Distillation","summary":" Spiking Neural Networks (SNNs) are promising for low-power computation due to\ntheir event-driven mechanism but often suffer from lower accuracy compared to\nArtificial Neural Networks (ANNs). ANN-to-SNN knowledge distillation can\nimprove SNN performance, but previous methods either focus solely on label\ninformation, missing valuable intermediate layer features, or use a layer-wise\napproach that neglects spatial and temporal semantic inconsistencies, leading\nto performance degradation. To address these limitations, we propose a novel\nmethod called self-attentive spatio-temporal calibration (SASTC). SASTC uses\nself-attention to identify semantically aligned layer pairs between ANN and\nSNN, both spatially and temporally. This enables the autonomous transfer of\nrelevant semantic information. Extensive experiments show that SASTC\noutperforms existing methods, effectively solving the mismatching problem.\nSuperior accuracy results include 95.12% on CIFAR-10, 79.40% on CIFAR-100 with\n2 time steps, and 68.69% on ImageNet with 4 time steps for static datasets, and\n97.92% on DVS-Gesture and 83.60% on DVS-CIFAR10 for neuromorphic datasets. 
This\nmarks the first time SNNs have outperformed ANNs on both CIFAR-10 and\nCIFAR-100, shedding new light on the potential applications of SNNs.\n","authors":["Di Hong","Yueming Wang"],"pdf_url":"https://arxiv.org/pdf/2501.08049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08042v1","updated":"2025-01-14T11:47:35Z","published":"2025-01-14T11:47:35Z","title":"Exploring visual language models as a powerful tool in the diagnosis of\n Ewing Sarcoma","summary":" Ewing's sarcoma (ES), characterized by a high density of small round blue\ncells without structural organization, presents a significant health concern,\nparticularly among adolescents aged 10 to 19. Artificial intelligence-based\nsystems for automated analysis of histopathological images are promising to\ncontribute to an accurate diagnosis of ES. In this context, this study explores\nthe feature extraction ability of different pre-training strategies for\ndistinguishing ES from other soft tissue or bone sarcomas with similar\nmorphology in digitized tissue microarrays for the first time, as far as we\nknow. Vision-language supervision (VLS) is compared to fully-supervised\nImageNet pre-training within a multiple instance learning paradigm. Our\nfindings indicate a substantial improvement in diagnostic accuracy with the\nadaption of VLS using an in-domain dataset. Notably, these models not only\nenhance the accuracy of predicted classes but also drastically reduce the\nnumber of trainable parameters and computational costs.\n","authors":["Alvaro Pastor-Naranjo","Pablo Meseguer","Rocío del Amor","Jose Antonio Lopez-Guerrero","Samuel Navarro","Katia Scotlandi","Antonio Llombart-Bosch","Isidro Machado","Valery Naranjo"],"pdf_url":"https://arxiv.org/pdf/2501.08042v1.pdf","comment":"11 pages, 5 figures, 2 tables. Oral presentation at KES-InMed 2024\n held in Madeira, Portugal"},{"id":"http://arxiv.org/abs/2501.08038v1","updated":"2025-01-14T11:42:54Z","published":"2025-01-14T11:42:54Z","title":"Robust Low-Light Human Pose Estimation through Illumination-Texture\n Modulation","summary":" As critical visual details become obscured, the low visibility and high ISO\nnoise in extremely low-light images pose a significant challenge to human pose\nestimation. Current methods fail to provide high-quality representations due to\nreliance on pixel-level enhancements that compromise semantics and the\ninability to effectively handle extreme low-light conditions for robust feature\nlearning. In this work, we propose a frequency-based framework for low-light\nhuman pose estimation, rooted in the \"divide-and-conquer\" principle. Instead of\nuniformly enhancing the entire image, our method focuses on task-relevant\ninformation. By applying dynamic illumination correction to the low-frequency\ncomponents and low-rank denoising to the high-frequency components, we\neffectively enhance both the semantic and texture information essential for\naccurate pose estimation. As a result, this targeted enhancement method results\nin robust, high-quality representations, significantly improving pose\nestimation performance. 
Extensive experiments demonstrate its superiority\nover state-of-the-art methods in various challenging low-light scenarios.\n","authors":["Feng Zhang","Ze Li","Xiatian Zhu","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2501.08038v1.pdf","comment":"5 pages, 2 figures, conference"},{"id":"http://arxiv.org/abs/2409.16597v3","updated":"2025-01-14T11:27:28Z","published":"2024-09-25T03:49:46Z","title":"EventHallusion: Diagnosing Event Hallucinations in Video LLMs","summary":" Recently, Multimodal Large Language Models (MLLMs) have made significant\nprogress in the video comprehension field. Despite remarkable content reasoning\nand instruction following capabilities they demonstrated, the hallucination\nproblem of these VideoLLMs is less explored compared with its counterpart in\nthe image domain. To mitigate this gap, we propose EventHallusion, a novel\nbenchmark that focuses on assessing the VideoLLMs' hallucination toward event,\nthe crux of video analysis. From a hallucination attribution perspective, our\nEventHallusion benchmark is curated to assess a VideoLLM's susceptibility\ntoward language priors and vision-language biases. On the other hand, we also\npropose a simple yet effective method, called Temporal Contrastive Decoding\n(TCD), to tackle the hallucination problems of VideoLLMs. The proposed TCD\nmethod rectifies the model's bias toward its priors during the decoding stage\nby comparing the original video with a modified version, in which temporal cues\nare disrupted. Through comprehensive evaluation of eight open-source and two\nclosed-source VideoLLMs on the proposed EventHallusion benchmark, we observe\nthat the open-source models suffer significantly from hallucination problems,\nwhereas the closed-source ones perform markedly better. By further equipping\nopen-source VideoLLMs with the proposed TCD approach, evident performance\nimprovements are achieved across most metrics in the EventHallusion benchmark.\nOur codes and benchmark data are available at\nhttps://github.com/Stevetich/EventHallusion.\n","authors":["Jiacheng Zhang","Yang Jiao","Shaoxiang Chen","Na Zhao","Jingjing Chen"],"pdf_url":"https://arxiv.org/pdf/2409.16597v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02774v3","updated":"2025-01-14T11:14:57Z","published":"2024-03-05T08:41:41Z","title":"Fast, Scale-Adaptive, and Uncertainty-Aware Downscaling of Earth System\n Model Fields with Generative Machine Learning","summary":" Accurate and high-resolution Earth system model (ESM) simulations are\nessential to assess the ecological and socio-economic impacts of anthropogenic\nclimate change, but are computationally too expensive to be run at sufficiently\nhigh spatial resolution. Recent machine learning approaches have shown\npromising results in downscaling ESM simulations, outperforming\nstate-of-the-art statistical approaches. However, existing methods require\ncomputationally costly retraining for each ESM and extrapolate poorly to\nclimates unseen during training. We address these shortcomings by learning a\nconsistency model (CM) that efficiently and accurately downscales arbitrary ESM\nsimulations without retraining in a zero-shot manner. Our approach yields\nprobabilistic downscaled fields at a resolution only limited by the\nobservational reference data. We show that the CM outperforms state-of-the-art\ndiffusion models at a fraction of computational cost while maintaining high\ncontrollability on the downscaling task. 
Further, our method generalizes to\nclimate states unseen during training without explicitly formulated physical\nconstraints.\n","authors":["Philipp Hess","Michael Aich","Baoxiang Pan","Niklas Boers"],"pdf_url":"https://arxiv.org/pdf/2403.02774v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04594v2","updated":"2025-01-14T11:03:05Z","published":"2024-12-05T20:15:34Z","title":"Learning Symmetries via Weight-Sharing with Doubly Stochastic Tensors","summary":" Group equivariance has emerged as a valuable inductive bias in deep learning,\nenhancing generalization, data efficiency, and robustness. Classically, group\nequivariant methods require the groups of interest to be known beforehand,\nwhich may not be realistic for real-world data. Additionally, baking in fixed\ngroup equivariance may impose overly restrictive constraints on model\narchitecture. This highlights the need for methods that can dynamically\ndiscover and apply symmetries as soft constraints. For neural network\narchitectures, equivariance is commonly achieved through group transformations\nof a canonical weight tensor, resulting in weight sharing over a given group\n$G$. In this work, we propose to learn such a weight-sharing scheme by defining\na collection of learnable doubly stochastic matrices that act as soft\npermutation matrices on canonical weight tensors, which can take regular group\nrepresentations as a special case. This yields learnable kernel transformations\nthat are jointly optimized with downstream tasks. We show that when the dataset\nexhibits strong symmetries, the permutation matrices will converge to regular\ngroup representations and our weight-sharing networks effectively become\nregular group convolutions. Additionally, the flexibility of the method enables\nit to effectively pick up on partial symmetries.\n","authors":["Putri A. van der Linden","Alejandro García-Castellanos","Sharvaree Vadgama","Thijs P. Kuipers","Erik J. Bekkers"],"pdf_url":"https://arxiv.org/pdf/2412.04594v2.pdf","comment":"19 pages, 14 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.15500v3","updated":"2025-01-14T11:02:13Z","published":"2024-07-22T09:31:30Z","title":"TextureCrop: Enhancing Synthetic Image Detection through Texture-based\n Cropping","summary":" Generative AI technologies produce increasingly realistic imagery, which,\ndespite its potential for creative applications, can also be misused to produce\nmisleading and harmful content. This renders Synthetic Image Detection (SID)\nmethods essential for identifying AI-generated content online. State-of-the-art\nSID methods typically resize or center-crop input images due to architectural\nor computational constraints, which hampers the detection of artifacts that\nappear in high-resolution images. To address this limitation, we propose\nTextureCrop, an image pre-processing component that can be plugged in any\npre-trained SID model to improve its performance. By focusing on high-frequency\nimage parts where generative artifacts are prevalent, TextureCrop enhances SID\nperformance with manageable memory requirements. 
Experimental results\ndemonstrate a consistent improvement in AUC across various detectors by 6.1%\ncompared to center cropping and by 15% compared to resizing, across\nhigh-resolution images from the Forensynths, Synthbuster and TWIGMA datasets.\nCode available at https://github.com/mever-team/texture-crop.\n","authors":["Despina Konstantinidou","Christos Koutlis","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.15500v3.pdf","comment":"10 pages, 7 images"},{"id":"http://arxiv.org/abs/2408.07583v2","updated":"2025-01-14T10:52:15Z","published":"2024-08-14T14:28:11Z","title":"Transformers and Large Language Models for Efficient Intrusion Detection\n Systems: A Comprehensive Survey","summary":" With significant advancements in Transformers LLMs, NLP has extended its\nreach into many research fields due to its enhanced capabilities in text\ngeneration and user interaction. One field benefiting greatly from these\nadvancements is cybersecurity. In cybersecurity, many parameters that need to\nbe protected and exchanged between senders and receivers are in the form of\ntext and tabular data, making NLP a valuable tool in enhancing the security\nmeasures of communication protocols. This survey paper provides a comprehensive\nanalysis of the utilization of Transformers and LLMs in cyber-threat detection\nsystems. The methodology of paper selection and bibliometric analysis is\noutlined to establish a rigorous framework for evaluating existing research.\nThe fundamentals of Transformers are discussed, including background\ninformation on various cyber-attacks and datasets commonly used in this field.\nThe survey explores the application of Transformers in IDSs, focusing on\ndifferent architectures such as Attention-based models, LLMs like BERT and GPT,\nCNN/LSTM-Transformer hybrids, emerging approaches like ViTs, among others.\nFurthermore, it explores the diverse environments and applications where\nTransformers and LLMs-based IDS have been implemented, including computer\nnetworks, IoT devices, critical infrastructure protection, cloud computing,\nSDN, as well as in autonomous vehicles. The paper also addresses research\nchallenges and future directions in this area, identifying key issues such as\ninterpretability, scalability, and adaptability to evolving threats, and more.\nFinally, the conclusion summarizes the findings and highlights the significance\nof Transformers and LLMs in enhancing cyber-threat detection capabilities,\nwhile also outlining potential avenues for further research and development.\n","authors":["Hamza Kheddar"],"pdf_url":"https://arxiv.org/pdf/2408.07583v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.04760 by other authors"},{"id":"http://arxiv.org/abs/2501.08005v1","updated":"2025-01-14T10:49:26Z","published":"2025-01-14T10:49:26Z","title":"DisCoPatch: Batch Statistics Are All You Need For OOD Detection, But\n Only If You Can Trust Them","summary":" Out-of-distribution (OOD) detection holds significant importance across many\napplications. While semantic and domain-shift OOD problems are well-studied,\nthis work focuses on covariate shifts - subtle variations in the data\ndistribution that can degrade machine learning performance. We hypothesize that\ndetecting these subtle shifts can improve our understanding of in-distribution\nboundaries, ultimately improving OOD detection. 
In adversarial discriminators\ntrained with Batch Normalization (BN), real and adversarial samples form\ndistinct domains with unique batch statistics - a property we exploit for OOD\ndetection. We introduce DisCoPatch, an unsupervised Adversarial Variational\nAutoencoder (VAE) framework that harnesses this mechanism. During inference,\nbatches consist of patches from the same image, ensuring a consistent data\ndistribution that allows the model to rely on batch statistics. DisCoPatch uses\nthe VAE's suboptimal outputs (generated and reconstructed) as negative samples\nto train the discriminator, thereby improving its ability to delineate the\nboundary between in-distribution samples and covariate shifts. By tightening\nthis boundary, DisCoPatch achieves state-of-the-art results in public OOD\ndetection benchmarks. The proposed model not only excels in detecting covariate\nshifts, achieving 95.5% AUROC on ImageNet-1K(-C) but also outperforms all prior\nmethods on public Near-OOD (95.0%) benchmarks. With a compact model size of\n25MB, it achieves high OOD detection performance at notably lower latency than\nexisting methods, making it an efficient and practical solution for real-world\nOOD detection applications. The code will be made publicly available\n","authors":["Francisco Caetano","Christiaan Viviers","Luis A. Zavala-Mondragón","Peter H. N. de With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2501.08005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08002v1","updated":"2025-01-14T10:46:41Z","published":"2025-01-14T10:46:41Z","title":"Maximizing Uncertainty for Federated learning via Bayesian\n Optimisation-based Model Poisoning","summary":" As we transition from Narrow Artificial Intelligence towards Artificial Super\nIntelligence, users are increasingly concerned about their privacy and the\ntrustworthiness of machine learning (ML) technology. A common denominator for\nthe metrics of trustworthiness is the quantification of uncertainty inherent in\nDL algorithms, and specifically in the model parameters, input data, and model\npredictions. One of the common approaches to address privacy-related issues in\nDL is to adopt distributed learning such as federated learning (FL), where\nprivate raw data is not shared among users. Despite the privacy-preserving\nmechanisms in FL, it still faces challenges in trustworthiness. Specifically,\nthe malicious users, during training, can systematically create malicious model\nparameters to compromise the models predictive and generative capabilities,\nresulting in high uncertainty about their reliability. To demonstrate malicious\nbehaviour, we propose a novel model poisoning attack method named Delphi which\naims to maximise the uncertainty of the global model output. We achieve this by\ntaking advantage of the relationship between the uncertainty and the model\nparameters of the first hidden layer of the local model. Delphi employs two\ntypes of optimisation , Bayesian Optimisation and Least Squares Trust Region,\nto search for the optimal poisoned model parameters, named as Delphi-BO and\nDelphi-LSTR. We quantify the uncertainty using the KL Divergence to minimise\nthe distance of the predictive probability distribution towards an uncertain\ndistribution of model output. Furthermore, we establish a mathematical proof\nfor the attack effectiveness demonstrated in FL. 
Numerical results demonstrate\nthat Delphi-BO induces a higher amount of uncertainty than Delphi-LSTR,\nhighlighting the vulnerability of FL systems to model poisoning attacks.\n","authors":["Marios Aristodemou","Xiaolan Liu","Yuan Wang","Konstantinos G. Kyriakopoulos","Sangarapillai Lambotharan","Qingsong Wei"],"pdf_url":"https://arxiv.org/pdf/2501.08002v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2501.07994v1","updated":"2025-01-14T10:38:18Z","published":"2025-01-14T10:38:18Z","title":"Combining imaging and shape features for prediction tasks of Alzheimer's\n disease classification and brain age regression","summary":" We investigate combining imaging and shape features extracted from MRI for\nthe clinically relevant tasks of brain age prediction and Alzheimer's disease\nclassification. Our proposed model fuses ResNet-extracted image embeddings with\nshape embeddings from a bespoke graph neural network. The shape embeddings are\nderived from surface meshes of 15 brain structures, capturing detailed\ngeometric information. Combined with the appearance features from T1-weighted\nimages, we observe improvements in the prediction performance on both tasks,\nwith substantial gains for classification. We evaluate the model using public\ndatasets, including CamCAN, IXI, and OASIS3, demonstrating the effectiveness of\nfusing imaging and shape features for brain analysis.\n","authors":["Nairouz Shehata","Carolina Piçarra","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2501.07994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03033v3","updated":"2025-01-14T10:34:00Z","published":"2024-11-05T12:10:02Z","title":"Rethinking Decoders for Transformer-based Semantic Segmentation: A\n Compression Perspective","summary":" State-of-the-art methods for Transformer-based semantic segmentation\ntypically adopt Transformer decoders that are used to extract additional\nembeddings from image embeddings via cross-attention, refine either or both\ntypes of embeddings via self-attention, and project image embeddings onto the\nadditional embeddings via dot-product. Despite their remarkable success, these\nempirical designs still lack theoretical justifications or interpretations,\nthus hindering potentially principled improvements. In this paper, we argue\nthat there are fundamental connections between semantic segmentation and\ncompression, especially between the Transformer decoders and Principal\nComponent Analysis (PCA). From such a perspective, we derive a white-box, fully\nattentional DEcoder for PrIncipled semantiC segmenTation (DEPICT), with the\ninterpretations as follows: 1) the self-attention operator refines image\nembeddings to construct an ideal principal subspace that aligns with the\nsupervision and retains most information; 2) the cross-attention operator seeks\nto find a low-rank approximation of the refined image embeddings, which is\nexpected to be a set of orthonormal bases of the principal subspace and\ncorresponds to the predefined classes; 3) the dot-product operation yields\ncompact representation for image embeddings as segmentation masks. Experiments\nconducted on dataset ADE20K find that DEPICT consistently outperforms its\nblack-box counterpart, Segmenter, and it is lightweight and more robust.\n","authors":["Qishuai Wen","Chun-Guang Li"],"pdf_url":"https://arxiv.org/pdf/2411.03033v3.pdf","comment":"NeurIPS2024. 
Code:https://github.com/QishuaiWen/DEPICT/"},{"id":"http://arxiv.org/abs/2407.10377v3","updated":"2025-01-14T10:30:19Z","published":"2024-07-15T01:11:30Z","title":"Enhanced Masked Image Modeling to Avoid Model Collapse on Multi-modal\n MRI Datasets","summary":" Multi-modal magnetic resonance imaging (MRI) provides information of lesions\nfor computer-aided diagnosis from different views. Deep learning algorithms are\nsuitable for identifying specific anatomical structures, segmenting lesions,\nand classifying diseases. Manual labels are limited due to the high expense,\nwhich hinders further improvement of accuracy. Self-supervised learning,\nparticularly masked image modeling (MIM), has shown promise in utilizing\nunlabeled data. However, we spot model collapse when applying MIM to\nmulti-modal MRI datasets. The performance of downstream tasks does not see any\nimprovement following the collapsed model. To solve model collapse, we analyze\nand address it in two types: complete collapse and dimensional collapse. We\nfind complete collapse occurs because the collapsed loss value in multi-modal\nMRI datasets falls below the normally converged loss value. Based on this, the\nhybrid mask pattern (HMP) masking strategy is introduced to elevate the\ncollapsed loss above the normally converged loss value and avoid complete\ncollapse. Additionally, we reveal that dimensional collapse stems from\ninsufficient feature uniformity in MIM. We mitigate dimensional collapse by\nintroducing the pyramid barlow twins (PBT) module as an explicit regularization\nmethod. Overall, we construct the enhanced MIM (E-MIM) with HMP and PBT module\nto avoid model collapse on multi-modal MRI. Experiments are conducted on three\nmulti-modal MRI datasets to validate the effectiveness of our approach in\npreventing both types of model collapse. By preventing model collapse, the\ntraining of the model becomes more stable, resulting in a decent improvement in\nperformance for segmentation and classification tasks. The code is available at\nhttps://github.com/LinxuanHan/E-MIM.\n","authors":["Linxuan Han","Sa Xiao","Zimeng Li","Haidong Li","Xiuchao Zhao","Yeqing Han","Fumin Guo","Xin Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.10377v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02297v2","updated":"2025-01-14T10:27:40Z","published":"2024-08-05T08:14:28Z","title":"Perception Matters: Enhancing Embodied AI with Uncertainty-Aware\n Semantic Segmentation","summary":" Embodied AI has made significant progress acting in unexplored environments.\nHowever, tasks such as object search have largely focused on efficient policy\nlearning. In this work, we identify several gaps in current search methods:\nThey largely focus on dated perception models, neglect temporal aggregation,\nand transfer from ground truth directly to noisy perception at test time,\nwithout accounting for the resulting overconfidence in the perceived state. We\naddress the identified problems through calibrated perception probabilities and\nuncertainty across aggregation and found decisions, thereby adapting the models\nfor sequential tasks. The resulting methods can be directly integrated with\npretrained models across a wide family of existing search approaches at no\nadditional training cost. We perform extensive evaluations of aggregation\nmethods across both different semantic perception models and policies,\nconfirming the importance of calibrated uncertainties in both the aggregation\nand found decisions. 
We make the code and trained models available at\nhttps://semantic-search.cs.uni-freiburg.de.\n","authors":["Sai Prasanna","Daniel Honerkamp","Kshitij Sirohi","Tim Welschehold","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2408.02297v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07988v1","updated":"2025-01-14T10:24:20Z","published":"2025-01-14T10:24:20Z","title":"GAC-Net_Geometric and attention-based Network for Depth Completion","summary":" Depth completion is a key task in autonomous driving, aiming to complete\nsparse LiDAR depth measurements into high-quality dense depth maps through\nimage guidance. However, existing methods usually treat depth maps as an\nadditional channel of color images, or directly perform convolution on sparse\ndata, failing to fully exploit the 3D geometric information in depth maps,\nespecially with limited performance in complex boundaries and sparse areas. To\naddress these issues, this paper proposes a depth completion network combining\nchannel attention mechanism and 3D global feature perception (CGA-Net). The\nmain innovations include: 1) Utilizing PointNet++ to extract global 3D\ngeometric features from sparse depth maps, enhancing the scene perception\nability of low-line LiDAR data; 2) Designing a channel-attention-based\nmultimodal feature fusion module to efficiently integrate sparse depth, RGB\nimages, and 3D geometric features; 3) Combining residual learning with CSPN++\nto optimize the depth refinement stage, further improving the completion\nquality in edge areas and complex scenes. Experiments on the KITTI depth\ncompletion dataset show that CGA-Net can significantly improve the prediction\naccuracy of dense depth maps, achieving a new state-of-the-art (SOTA), and\ndemonstrating strong robustness to sparse and complex scenes.\n","authors":["Kuang Zhu","Xingli Gan","Min Sun"],"pdf_url":"https://arxiv.org/pdf/2501.07988v1.pdf","comment":"13pages,4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.07984v1","updated":"2025-01-14T10:09:55Z","published":"2025-01-14T10:09:55Z","title":"Threshold Attention Network for Semantic Segmentation of Remote Sensing\n Images","summary":" Semantic segmentation of remote sensing images is essential for various\napplications, including vegetation monitoring, disaster management, and urban\nplanning. Previous studies have demonstrated that the self-attention mechanism\n(SA) is an effective approach for designing segmentation networks that can\ncapture long-range pixel dependencies. SA enables the network to model the\nglobal dependencies between the input features, resulting in improved\nsegmentation outcomes. However, the high density of attentional feature maps\nused in this mechanism causes exponential increases in computational\ncomplexity. Additionally, it introduces redundant information that negatively\nimpacts the feature representation. Inspired by traditional threshold\nsegmentation algorithms, we propose a novel threshold attention mechanism\n(TAM). This mechanism significantly reduces computational effort while also\nbetter modeling the correlation between different regions of the feature map.\nBased on TAM, we present a threshold attention network (TANet) for semantic\nsegmentation. TANet consists of an attentional feature enhancement module\n(AFEM) for global feature enhancement of shallow features and a threshold\nattention pyramid pooling module (TAPP) for acquiring feature information at\ndifferent scales for deep features. 
We have conducted extensive experiments on\nthe ISPRS Vaihingen and Potsdam datasets. The results demonstrate the validity\nand superiority of our proposed TANet compared to the most state-of-the-art\nmodels.\n","authors":["Wei Long","Yongjun Zhang","Zhongwei Cui","Yujie Xu","Xuexue Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07983v1","updated":"2025-01-14T10:06:02Z","published":"2025-01-14T10:06:02Z","title":"V-Trans4Style: Visual Transition Recommendation for Video Production\n Style Adaptation","summary":" We introduce V-Trans4Style, an innovative algorithm tailored for dynamic\nvideo content editing needs. It is designed to adapt videos to different\nproduction styles like documentaries, dramas, feature films, or a specific\nYouTube channel's video-making technique. Our algorithm recommends optimal\nvisual transitions to help achieve this flexibility using a more bottom-up\napproach. We first employ a transformer-based encoder-decoder network to learn\nrecommending temporally consistent and visually seamless sequences of visual\ntransitions using only the input videos. We then introduce a style conditioning\nmodule that leverages this model to iteratively adjust the visual transitions\nobtained from the decoder through activation maximization. We demonstrate the\nefficacy of our method through experiments conducted on our newly introduced\nAutoTransition++ dataset. It is a 6k video version of AutoTransition Dataset\nthat additionally categorizes its videos into different production style\ncategories. Our encoder-decoder model outperforms the state-of-the-art\ntransition recommendation method, achieving improvements of 10% to 80% in\nRecall@K and mean rank values over baseline. Our style conditioning module\nresults in visual transitions that improve the capture of the desired video\nproduction style characteristics by an average of around 12% in comparison to\nother methods when measured with similarity metrics. We hope that our work\nserves as a foundation for exploring and understanding video production styles\nfurther.\n","authors":["Pooja Guhan","Tsung-Wei Huang","Guan-Ming Su","Subhadra Gopalakrishnan","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2501.07983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07978v1","updated":"2025-01-14T09:52:56Z","published":"2025-01-14T09:52:56Z","title":"Facial Dynamics in Video: Instruction Tuning for Improved Facial\n Expression Perception and Contextual Awareness","summary":" Facial expression captioning has found widespread application across various\ndomains. Recently, the emergence of video Multimodal Large Language Models\n(MLLMs) has shown promise in general video understanding tasks. However,\ndescribing facial expressions within videos poses two major challenges for\nthese models: (1) the lack of adequate datasets and benchmarks, and (2) the\nlimited visual token capacity of video MLLMs. To address these issues, this\npaper introduces a new instruction-following dataset tailored for dynamic\nfacial expression caption. The dataset comprises 5,033 high-quality video clips\nannotated manually, containing over 700,000 tokens. Its purpose is to improve\nthe capability of video MLLMs to discern subtle facial nuances. Furthermore, we\npropose FaceTrack-MM, which leverages a limited number of tokens to encode the\nmain character's face. 
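The style conditioning module in the V-Trans4Style entry above adjusts the decoder's transitions through activation maximization, i.e. gradient ascent on an input so that a chosen output score increases. A generic sketch of that mechanism, assuming `model` returns per-style scores (function name, optimizer, and hyperparameters are illustrative, not the paper's procedure):

```python
import torch

def activation_maximization(model, x, target_index, steps=50, lr=0.1):
    # Gradient ascent on the input so the target output unit grows.
    x = x.clone().detach().requires_grad_(True)
    opt = torch.optim.Adam([x], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        score = model(x)[..., target_index].mean()  # score of the desired style/class
        (-score).backward()                         # maximize by minimizing the negative
        opt.step()
    return x.detach()
```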
This model demonstrates superior performance in tracking\nfaces and focusing on the facial expressions of the main characters, even in\nintricate multi-person scenarios. Additionally, we introduce a novel evaluation\nmetric combining event extraction, relation classification, and the longest\ncommon subsequence (LCS) algorithm to assess the content consistency and\ntemporal sequence consistency of generated text. Moreover, we present\nFEC-Bench, a benchmark designed to assess the performance of existing video\nMLLMs in this specific task. All data and source code will be made publicly\navailable.\n","authors":["Jiaxing Zhao","Boyuan Sun","Xiang Chen","Xihan Wei"],"pdf_url":"https://arxiv.org/pdf/2501.07978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07972v1","updated":"2025-01-14T09:45:10Z","published":"2025-01-14T09:45:10Z","title":"Zero-shot Video Moment Retrieval via Off-the-shelf Multimodal Large\n Language Models","summary":" The target of video moment retrieval (VMR) is predicting temporal spans\nwithin a video that semantically match a given linguistic query. Existing VMR\nmethods based on multimodal large language models (MLLMs) overly rely on\nexpensive high-quality datasets and time-consuming fine-tuning. Although some\nrecent studies introduce a zero-shot setting to avoid fine-tuning, they\noverlook inherent language bias in the query, leading to erroneous\nlocalization. To tackle the aforementioned challenges, this paper proposes\nMoment-GPT, a tuning-free pipeline for zero-shot VMR utilizing frozen MLLMs.\nSpecifically, we first employ LLaMA-3 to correct and rephrase the query to\nmitigate language bias. Subsequently, we design a span generator combined with\nMiniGPT-v2 to produce candidate spans adaptively. Finally, to leverage the\nvideo comprehension capabilities of MLLMs, we apply VideoChatGPT and span\nscorer to select the most appropriate spans. Our proposed method substantially\noutperforms the state-ofthe-art MLLM-based and zero-shot models on several\npublic datasets, including QVHighlights, ActivityNet-Captions, and\nCharades-STA.\n","authors":["Yifang Xu","Yunzhuo Sun","Benxiang Zhai","Ming Li","Wenxin Liang","Yang Li","Sidan Du"],"pdf_url":"https://arxiv.org/pdf/2501.07972v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2409.09610v2","updated":"2025-01-14T09:44:01Z","published":"2024-09-15T04:34:38Z","title":"TextureDiffusion: Target Prompt Disentangled Editing for Various Texture\n Transfer","summary":" Recently, text-guided image editing has achieved significant success.\nHowever, existing methods can only apply simple textures like wood or gold when\nchanging the texture of an object. Complex textures such as cloud or fire pose\na challenge. This limitation stems from that the target prompt needs to contain\nboth the input image content and , restricting the texture\nrepresentation. In this paper, we propose TextureDiffusion, a tuning-free image\nediting method applied to various texture transfer. Initially, the target\nprompt is directly set to \"\", making the texture disentangled from the\ninput image content to enhance texture representation. Subsequently, query\nfeatures in self-attention and features in residual blocks are utilized to\npreserve the structure of the input image. Finally, to maintain the background,\nwe introduce an edit localization technique which blends the self-attention\nresults and the intermediate latents. 
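The evaluation metric in the FaceTrack-MM entry above uses the longest common subsequence (LCS) to score temporal sequence consistency. For reference, the classic dynamic-programming LCS length (the full metric also involves event extraction and relation classification, which are not sketched here):

```python
def lcs_length(a, b):
    # Dynamic programming over prefixes of the two sequences.
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if x == y else max(dp[i - 1][j], dp[i][j - 1])
    return dp[len(a)][len(b)]

print(lcs_length("ABCBDAB", "BDCABA"))  # 4
```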
Comprehensive experiments demonstrate\nthat TextureDiffusion can harmoniously transfer various textures with excellent\nstructure and background preservation. Code is publicly available at\nhttps://github.com/THU-CVML/TextureDiffusion\n","authors":["Zihan Su","Junhao Zhuang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2409.09610v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03907v2","updated":"2025-01-14T09:40:53Z","published":"2024-12-05T06:26:32Z","title":"ONER: Online Experience Replay for Incremental Anomaly Detection","summary":" Incremental anomaly detection sequentially recognizes abnormal regions in\nnovel categories for dynamic industrial scenarios. This remains highly\nchallenging due to knowledge overwriting and feature conflicts, leading to\ncatastrophic forgetting. In this work, we propose ONER, an end-to-end ONline\nExperience Replay method, which efficiently mitigates catastrophic forgetting\nwhile adapting to new tasks with minimal cost. Specifically, our framework\nutilizes two types of experiences from past tasks: decomposed prompts and\nsemantic prototypes, addressing both model parameter updates and feature\noptimization. The decomposed prompts consist of learnable components that\nassemble to produce attention-conditioned prompts. These prompts reuse\npreviously learned knowledge, enabling model to learn novel tasks effectively.\nThe semantic prototypes operate at both pixel and image levels, performing\nregularization in the latent feature space to prevent forgetting across various\ntasks. Extensive experiments demonstrate that our method achieves\nstate-of-the-art performance in incremental anomaly detection with\nsignificantly reduced forgetting, as well as efficiently adapting to new\ncategories with minimal costs. These results confirm the efficiency and\nstability of ONER, making it a powerful solution for real-world applications.\n","authors":["Yizhou Jin","Jiahui Zhu","Guodong Wang","Shiwei Li","Jinjin Zhang","Qingjie Liu","Xinyue Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03907v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07960v1","updated":"2025-01-14T09:24:27Z","published":"2025-01-14T09:24:27Z","title":"SkipClick: Combining Quick Responses and Low-Level Features for\n Interactive Segmentation in Winter Sports Contexts","summary":" In this paper, we present a novel architecture for interactive segmentation\nin winter sports contexts. The field of interactive segmentation deals with the\nprediction of high-quality segmentation masks by informing the network about\nthe objects position with the help of user guidance. In our case the guidance\nconsists of click prompts. For this task, we first present a baseline\narchitecture which is specifically geared towards quickly responding after each\nclick. Afterwards, we motivate and describe a number of architectural\nmodifications which improve the performance when tasked with segmenting winter\nsports equipment on the WSESeg dataset. With regards to the average NoC@85\nmetric on the WSESeg classes, we outperform SAM and HQ-SAM by 2.336 and 7.946\nclicks, respectively. When applied to the HQSeg-44k dataset, our system\ndelivers state-of-the-art results with a NoC@90 of 6.00 and NoC@95 of 9.89. 
In\naddition to that, we test our model on a novel dataset containing masks for\nhumans during skiing.\n","authors":["Robin Schön","Julian Lorenz","Daniel Kienzle","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2501.07960v1.pdf","comment":"4 figures, 6 tables, 12 pages"},{"id":"http://arxiv.org/abs/2501.07957v1","updated":"2025-01-14T09:21:17Z","published":"2025-01-14T09:21:17Z","title":"AI Guide Dog: Egocentric Path Prediction on Smartphone","summary":" This paper introduces AI Guide Dog (AIGD), a lightweight egocentric\nnavigation assistance system for visually impaired individuals, designed for\nreal-time deployment on smartphones. AIGD addresses key challenges in blind\nnavigation by employing a vision-only, multi-label classification approach to\npredict directional commands, ensuring safe traversal across diverse\nenvironments. We propose a novel technique to enable goal-based outdoor\nnavigation by integrating GPS signals and high-level directions, while also\naddressing uncertain multi-path predictions for destination-free indoor\nnavigation. Our generalized model is the first navigation assistance system to\nhandle both goal-oriented and exploratory navigation scenarios across indoor\nand outdoor settings, establishing a new state-of-the-art in blind navigation.\nWe present methods, datasets, evaluations, and deployment insights to encourage\nfurther innovations in assistive navigation systems.\n","authors":["Aishwarya Jadhav","Jeffery Cao","Abhishree Shetty","Urvashi Priyam Kumar","Aditi Sharma","Ben Sukboontip","Jayant Sravan Tamarapalli","Jingyi Zhang","Anirudh Koul"],"pdf_url":"https://arxiv.org/pdf/2501.07957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04665v3","updated":"2025-01-14T09:11:42Z","published":"2025-01-08T18:22:44Z","title":"HyFusion: Enhanced Reception Field Transformer for Hyperspectral Image\n Fusion","summary":" Hyperspectral image (HSI) fusion addresses the challenge of reconstructing\nHigh-Resolution HSIs (HR-HSIs) from High-Resolution Multispectral images\n(HR-MSIs) and Low-Resolution HSIs (LR-HSIs), a critical task given the high\ncosts and hardware limitations associated with acquiring high-quality HSIs.\nWhile existing methods leverage spatial and spectral relationships, they often\nsuffer from limited receptive fields and insufficient feature utilization,\nleading to suboptimal performance. Furthermore, the scarcity of high-quality\nHSI data highlights the importance of efficient data utilization to maximize\nreconstruction quality. To address these issues, we propose HyFusion, a novel\nDual-Coupled Network (DCN) framework designed to enhance cross-domain feature\nextraction and enable effective feature map reusing. The framework first\nprocesses HR-MSI and LR-HSI inputs through specialized subnetworks that\nmutually enhance each other during feature extraction, preserving complementary\nspatial and spectral details. At its core, HyFusion utilizes an Enhanced\nReception Field Block (ERFB), which combines shifting-window attention and\ndense connections to expand the receptive field, effectively capturing\nlong-range dependencies while minimizing information loss. Extensive\nexperiments demonstrate that HyFusion achieves state-of-the-art performance in\nHR-MSI/LR-HSI fusion, significantly improving reconstruction quality while\nmaintaining a compact model size and computational efficiency. 
By integrating\nenhanced receptive fields and feature map reusing into a coupled network\narchitecture, HyFusion provides a practical and effective solution for HSI\nfusion in resource-constrained scenarios, setting a new benchmark in\nhyperspectral imaging. Our code will be publicly available.\n","authors":["Chia-Ming Lee","Yu-Fan Lin","Yu-Hao Ho","Li-Wei Kang","Chih-Chung Hsu"],"pdf_url":"https://arxiv.org/pdf/2501.04665v3.pdf","comment":"Submitted to IGARSS 2025"},{"id":"http://arxiv.org/abs/2501.07953v1","updated":"2025-01-14T09:09:14Z","published":"2025-01-14T09:09:14Z","title":"Robust Hyperspectral Image Panshapring via Sparse Spatial-Spectral\n Representation","summary":" High-resolution hyperspectral imaging plays a crucial role in various remote\nsensing applications, yet its acquisition often faces fundamental limitations\ndue to hardware constraints. This paper introduces S$^{3}$RNet, a novel\nframework for hyperspectral image pansharpening that effectively combines\nlow-resolution hyperspectral images (LRHSI) with high-resolution multispectral\nimages (HRMSI) through sparse spatial-spectral representation. The core of\nS$^{3}$RNet is the Multi-Branch Fusion Network (MBFN), which employs parallel\nbranches to capture complementary features at different spatial and spectral\nscales. Unlike traditional approaches that treat all features equally, our\nSpatial-Spectral Attention Weight Block (SSAWB) dynamically adjusts feature\nweights to maintain sparse representation while suppressing noise and\nredundancy. To enhance feature propagation, we incorporate the Dense Feature\nAggregation Block (DFAB), which efficiently aggregates inputted features\nthrough dense connectivity patterns. This integrated design enables S$^{3}$RNet\nto selectively emphasize the most informative features from differnt scale\nwhile maintaining computational efficiency. Comprehensive experiments\ndemonstrate that S$^{3}$RNet achieves state-of-the-art performance across\nmultiple evaluation metrics, showing particular strength in maintaining high\nreconstruction quality even under challenging noise conditions. The code will\nbe made publicly available.\n","authors":["Chia-Ming Lee","Yu-Fan Lin","Li-Wei Kang","Chih-Chung Hsu"],"pdf_url":"https://arxiv.org/pdf/2501.07953v1.pdf","comment":"Submitted to IGARSS 2025"},{"id":"http://arxiv.org/abs/2501.00700v2","updated":"2025-01-14T09:04:35Z","published":"2025-01-01T02:18:18Z","title":"Knowledge-Guided Prompt Learning for Deepfake Facial Image Detection","summary":" Recent generative models demonstrate impressive performance on synthesizing\nphotographic images, which makes humans hardly to distinguish them from\npristine ones, especially on realistic-looking synthetic facial images.\nPrevious works mostly focus on mining discriminative artifacts from vast amount\nof visual data. However, they usually lack the exploration of prior knowledge\nand rarely pay attention to the domain shift between training categories (e.g.,\nnatural and indoor objects) and testing ones (e.g., fine-grained human facial\nimages), resulting in unsatisfactory detection performance. To address these\nissues, we propose a novel knowledge-guided prompt learning method for deepfake\nfacial image detection. Specifically, we retrieve forgery-related prompts from\nlarge language models as expert knowledge to guide the optimization of\nlearnable prompts. 
Besides, we elaborate test-time prompt tuning to alleviate\nthe domain shift, achieving significant performance improvement and boosting\nthe application in real-world scenarios. Extensive experiments on\nDeepFakeFaceForensics dataset show that our proposed approach notably\noutperforms state-of-the-art methods.\n","authors":["Hao Wang","Cheng Deng","Zhidong Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.00700v2.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2305.11421v3","updated":"2025-01-14T08:59:17Z","published":"2023-05-19T04:16:50Z","title":"PastNet: Introducing Physical Inductive Biases for Spatio-temporal Video\n Prediction","summary":" In this paper, we investigate the challenge of spatio-temporal video\nprediction task, which involves generating future video frames based on\nhistorical spatio-temporal observation streams. Existing approaches typically\nutilize external information such as semantic maps to improve video prediction\naccuracy, which often neglect the inherent physical knowledge embedded within\nvideos. Worse still, their high computational costs could impede their\napplications for high-resolution videos. To address these constraints, we\nintroduce a novel framework called \\underline{P}hysics-\\underline{a}ssisted\n\\underline{S}patio-\\underline{t}emporal \\underline{Net}work (PastNet) for\nhigh-quality video prediction. The core of PastNet lies in incorporating a\nspectral convolution operator in the Fourier domain, which efficiently\nintroduces inductive biases from the underlying physical laws. Additionally, we\nemploy a memory bank with the estimated intrinsic dimensionality to discretize\nlocal features during the processing of complex spatio-temporal signals,\nthereby reducing computational costs and facilitating efficient high-resolution\nvideo prediction. Extensive experiments on various widely-used spatio-temporal\nvideo benchmarks demonstrate the effectiveness and efficiency of the proposed\nPastNet compared with a range of state-of-the-art methods, particularly in\nhigh-resolution scenarios.\n","authors":["Hao Wu","Fan Xu","Chong Chen","Xian-Sheng Hua","Xiao Luo","Haixin Wang"],"pdf_url":"https://arxiv.org/pdf/2305.11421v3.pdf","comment":"11"},{"id":"http://arxiv.org/abs/2501.07945v1","updated":"2025-01-14T08:56:59Z","published":"2025-01-14T08:56:59Z","title":"Early prediction of the transferability of bovine embryos from\n videomicroscopy","summary":" Videomicroscopy is a promising tool combined with machine learning for\nstudying the early development of in vitro fertilized bovine embryos and\nassessing its transferability as soon as possible. We aim to predict the embryo\ntransferability within four days at most, taking 2D time-lapse microscopy\nvideos as input. We formulate this problem as a supervised binary\nclassification problem for the classes transferable and not transferable. The\nchallenges are three-fold: 1) poorly discriminating appearance and motion, 2)\nclass ambiguity, 3) small amount of annotated data. We propose a 3D\nconvolutional neural network involving three pathways, which makes it\nmulti-scale in time and able to handle appearance and motion in different ways.\nFor training, we retain the focal loss. Our model, named SFR, compares\nfavorably to other methods. 
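The spectral convolution operator in the Fourier domain described in the PastNet entry above is commonly realized in FNO style: transform to frequency space, mix a fixed set of low-frequency modes with learned complex weights, and transform back. A minimal sketch under that assumption (not the paper's exact layer; `modes` must not exceed the available frequencies):

```python
import torch
import torch.nn as nn

class SpectralConv2d(nn.Module):
    def __init__(self, in_ch, out_ch, modes=12):
        super().__init__()
        self.modes = modes
        scale = 1.0 / (in_ch * out_ch)
        # Learned complex weights for the retained low-frequency modes.
        self.w = nn.Parameter(scale * torch.randn(in_ch, out_ch, modes, modes, dtype=torch.cfloat))

    def forward(self, x):                         # x: (B, C, H, W), real-valued
        B, C, H, W = x.shape
        xf = torch.fft.rfft2(x)                   # (B, C, H, W // 2 + 1), complex
        out = torch.zeros(B, self.w.shape[1], H, W // 2 + 1,
                          dtype=torch.cfloat, device=x.device)
        m = self.modes                            # assumes m <= H and m <= W // 2 + 1
        out[:, :, :m, :m] = torch.einsum("bixy,ioxy->boxy", xf[:, :, :m, :m], self.w)
        return torch.fft.irfft2(out, s=(H, W))    # back to the spatial domain
```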
Experiments demonstrate its effectiveness and\naccuracy for our challenging biological task.\n","authors":["Yasmine Hachani","Patrick Bouthemy","Elisa Fromont","Sylvie Ruffini","Ludivine Laffont","Alline de Paula Reis"],"pdf_url":"https://arxiv.org/pdf/2501.07945v1.pdf","comment":"Accepted at the 2024 IEEE International Conference on Image\n Processing"},{"id":"http://arxiv.org/abs/2501.03659v2","updated":"2025-01-14T08:52:51Z","published":"2025-01-07T09:47:46Z","title":"DehazeGS: Seeing Through Fog with 3D Gaussian Splatting","summary":" Current novel view synthesis tasks primarily rely on high-quality and clear\nimages. However, in foggy scenes, scattering and attenuation can significantly\ndegrade the reconstruction and rendering quality. Although NeRF-based dehazing\nreconstruction algorithms have been developed, their use of deep fully\nconnected neural networks and per-ray sampling strategies leads to high\ncomputational costs. Moreover, NeRF's implicit representation struggles to\nrecover fine details from hazy scenes. In contrast, recent advancements in 3D\nGaussian Splatting achieve high-quality 3D scene reconstruction by explicitly\nmodeling point clouds into 3D Gaussians. In this paper, we propose leveraging\nthe explicit Gaussian representation to explain the foggy image formation\nprocess through a physically accurate forward rendering process. We introduce\nDehazeGS, a method capable of decomposing and rendering a fog-free background\nfrom participating media using only muti-view foggy images as input. We model\nthe transmission within each Gaussian distribution to simulate the formation of\nfog. During this process, we jointly learn the atmospheric light and scattering\ncoefficient while optimizing the Gaussian representation of the hazy scene. In\nthe inference stage, we eliminate the effects of scattering and attenuation on\nthe Gaussians and directly project them onto a 2D plane to obtain a clear view.\nExperiments on both synthetic and real-world foggy datasets demonstrate that\nDehazeGS achieves state-of-the-art performance in terms of both rendering\nquality and computational efficiency.\n","authors":["Jinze Yu","Yiqun Wang","Zhengda Lu","Jianwei Guo","Yong Li","Hongxing Qin","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03659v2.pdf","comment":"9 pages,4 figures"},{"id":"http://arxiv.org/abs/2409.16793v2","updated":"2025-01-14T08:47:17Z","published":"2024-09-25T10:14:01Z","title":"Spacewalker: Traversing Representation Spaces for Fast Interactive\n Exploration and Annotation of Unstructured Data","summary":" In industries such as healthcare, finance, and manufacturing, analysis of\nunstructured textual data presents significant challenges for analysis and\ndecision making. Uncovering patterns within large-scale corpora and\nunderstanding their semantic impact is critical, but depends on domain experts\nor resource-intensive manual reviews. In response, we introduce Spacewalker in\nthis system demonstration paper, an interactive tool designed to analyze,\nexplore, and annotate data across multiple modalities. It allows users to\nextract data representations, visualize them in low-dimensional spaces and\ntraverse large datasets either exploratory or by querying regions of interest.\nWe evaluated Spacewalker through extensive experiments and annotation studies,\nassessing its efficacy in improving data integrity verification and annotation.\nWe show that Spacewalker reduces time and effort compared to traditional\nmethods. 
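The fog formation that the DehazeGS entry above inverts is conventionally written with the atmospheric scattering model I = J·t + A·(1 − t), where t is the transmission and A the atmospheric light. A one-line sketch of that forward model (generic physics, not the paper's per-Gaussian formulation):

```python
import torch

def hazy_image(clear, transmission, airlight):
    # I = J * t + A * (1 - t): scene radiance attenuated by transmission plus airlight.
    return clear * transmission + airlight * (1.0 - transmission)
```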
The code of this work is open-source and can be found at:\nhttps://github.com/code-lukas/Spacewalker\n","authors":["Lukas Heine","Fabian Hörst","Jana Fragemann","Gijs Luijten","Jan Egger","Fin Bahnsen","M. Saquib Sarfraz","Jens Kleesiek","Constantin Seibold"],"pdf_url":"https://arxiv.org/pdf/2409.16793v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.06664v3","updated":"2025-01-14T08:33:08Z","published":"2024-12-09T17:01:42Z","title":"Knowledge Transfer and Domain Adaptation for Fine-Grained Remote Sensing\n Image Segmentation","summary":" Fine-grained remote sensing image segmentation is essential for accurately\nidentifying detailed objects in remote sensing images. Recently, vision\ntransformer models (VTMs) pre-trained on large-scale datasets have demonstrated\nstrong zero-shot generalization. However, directly applying them to specific\ntasks may lead to domain shift. We introduce a novel end-to-end learning\nparadigm combining knowledge guidance with domain refinement to enhance\nperformance. We present two key components: the Feature Alignment Module (FAM)\nand the Feature Modulation Module (FMM). FAM aligns features from a CNN-based\nbackbone with those from the pretrained VTM's encoder using channel\ntransformation and spatial interpolation, and transfers knowledge via KL\ndivergence and L2 normalization constraint. FMM further adapts the knowledge to\nthe specific domain to address domain shift. We also introduce a fine-grained\ngrass segmentation dataset and demonstrate, through experiments on two\ndatasets, that our method achieves a significant improvement of 2.57 mIoU on\nthe grass dataset and 3.73 mIoU on the cloud dataset. The results highlight the\npotential of combining knowledge transfer and domain adaptation to overcome\ndomain-related challenges and data limitations. The project page is available\nat https://xavierjiezou.github.io/KTDA/.\n","authors":["Shun Zhang","Xuechao Zou","Kai Li","Congyan Lang","Shiying Wang","Pin Tao","Tengfei Cao"],"pdf_url":"https://arxiv.org/pdf/2412.06664v3.pdf","comment":"6 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2412.21079v3","updated":"2025-01-14T08:23:30Z","published":"2024-12-30T16:56:44Z","title":"Edicho: Consistent Image Editing in the Wild","summary":" As a verified need, consistent editing across in-the-wild images remains a\ntechnical challenge arising from various unmanageable factors, like object\nposes, lighting conditions, and photography environments. Edicho steps in with\na training-free solution based on diffusion models, featuring a fundamental\ndesign principle of using explicit image correspondence to direct editing.\nSpecifically, the key components include an attention manipulation module and a\ncarefully refined classifier-free guidance (CFG) denoising strategy, both of\nwhich take into account the pre-estimated correspondence. Such an\ninference-time algorithm enjoys a plug-and-play nature and is compatible to\nmost diffusion-based editing methods, such as ControlNet and BrushNet.\nExtensive results demonstrate the efficacy of Edicho in consistent cross-image\nediting under diverse settings. 
We will release the code to facilitate future\nstudies.\n","authors":["Qingyan Bai","Hao Ouyang","Yinghao Xu","Qiuyu Wang","Ceyuan Yang","Ka Leong Cheng","Yujun Shen","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2412.21079v3.pdf","comment":"Project page: https://ant-research.github.io/edicho/"},{"id":"http://arxiv.org/abs/2501.07922v1","updated":"2025-01-14T08:12:20Z","published":"2025-01-14T08:12:20Z","title":"VENOM: Text-driven Unrestricted Adversarial Example Generation with\n Diffusion Models","summary":" Adversarial attacks have proven effective in deceiving machine learning\nmodels by subtly altering input images, motivating extensive research in recent\nyears. Traditional methods constrain perturbations within $l_p$-norm bounds,\nbut advancements in Unrestricted Adversarial Examples (UAEs) allow for more\ncomplex, generative-model-based manipulations. Diffusion models now lead UAE\ngeneration due to superior stability and image quality over GANs. However,\nexisting diffusion-based UAE methods are limited to using reference images and\nface challenges in generating Natural Adversarial Examples (NAEs) directly from\nrandom noise, often producing uncontrolled or distorted outputs. In this work,\nwe introduce VENOM, the first text-driven framework for high-quality\nunrestricted adversarial examples generation through diffusion models. VENOM\nunifies image content generation and adversarial synthesis into a single\nreverse diffusion process, enabling high-fidelity adversarial examples without\nsacrificing attack success rate (ASR). To stabilize this process, we\nincorporate an adaptive adversarial guidance strategy with momentum, ensuring\nthat the generated adversarial examples $x^*$ align with the distribution\n$p(x)$ of natural images. Extensive experiments demonstrate that VENOM achieves\nsuperior ASR and image quality compared to prior methods, marking a significant\nadvancement in adversarial example generation and providing insights into model\nvulnerabilities for improved defense development.\n","authors":["Hui Kuurila-Zhang","Haoyu Chen","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.07922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10568v3","updated":"2025-01-14T08:01:17Z","published":"2024-03-14T17:47:10Z","title":"MoPE: Mixture of Prompt Experts for Parameter-Efficient and Scalable\n Multimodal Fusion","summary":" Despite the demonstrated parameter efficiency of prompt-based multimodal\nfusion methods, their limited adaptivity and expressiveness often result in\nsuboptimal performance compared to other tuning approaches. In this paper, we\nintroduce the Mixture of Prompt Experts (MoPE), the first technique designed to\novercome these limitations by decomposing standard prompts to capture\ninstance-level features adaptively. Building on this decomposition, MoPE\nenhances prompt fusion's expressiveness by leveraging multimodal pairing priors\nto route the most effective prompt for each instance dynamically. Compared to\nvanilla prompting, our MoPE-based fusion method exhibits greater\nexpressiveness, scaling more effectively with the training data and the overall\nnumber of trainable parameters. We also investigate regularization terms for\nexpert routing, which lead to emergent expert specialization with enhanced\nadaptiveness and interpretablity. 
Extensive experiments across six multimodal\ndatasets spanning four modalities demonstrate state-of-the-art performance for\nprompt fusion, matching or even surpassing the performance of fine-tuning while\nrequiring only 0.8% of the trainable parameters. Project homepage:\nhttps://github.com/songrise/MoPE\n","authors":["Ruixiang Jiang","Lingbo Liu","Changwen Chen"],"pdf_url":"https://arxiv.org/pdf/2403.10568v3.pdf","comment":"Under Review, Extended version of arxiv:2312.03734"},{"id":"http://arxiv.org/abs/2501.07901v1","updated":"2025-01-14T07:35:14Z","published":"2025-01-14T07:35:14Z","title":"Cloud Removal With PolSAR-Optical Data Fusion Using A Two-Flow Residual\n Network","summary":" Optical remote sensing images play a crucial role in the observation of the\nEarth's surface. However, obtaining complete optical remote sensing images is\nchallenging due to cloud cover. Reconstructing cloud-free optical images has\nbecome a major task in recent years. This paper presents a two-flow\nPolarimetric Synthetic Aperture Radar (PolSAR)-Optical data fusion cloud\nremoval algorithm (PODF-CR), which achieves the reconstruction of missing\noptical images. PODF-CR consists of an encoding module and a decoding module.\nThe encoding module includes two parallel branches that extract PolSAR image\nfeatures and optical image features. To address speckle noise in PolSAR images,\nwe introduce dynamic filters in the PolSAR branch for image denoising. To\nbetter facilitate the fusion between multimodal optical images and PolSAR\nimages, we propose fusion blocks based on cross-skip connections to enable\ninteraction of multimodal data information. The obtained fusion features are\nrefined through an attention mechanism to provide better conditions for the\nsubsequent decoding of the fused images. In the decoding module, multi-scale\nconvolution is introduced to obtain multi-scale information. Additionally, to\nbetter utilize comprehensive scattering information and polarization\ncharacteristics to assist in the restoration of optical images, we use a\ndataset for cloud restoration called OPT-BCFSAR-PFSAR, which includes\nbackscatter coefficient feature images and polarization feature images obtained\nfrom PoLSAR data and optical images. Experimental results demonstrate that this\nmethod outperforms existing methods in both qualitative and quantitative\nevaluations.\n","authors":["Yuxi Wang","Wenjuan Zhang","Bing Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07898v1","updated":"2025-01-14T07:26:55Z","published":"2025-01-14T07:26:55Z","title":"Demographic Variability in Face Image Quality Measures","summary":" Face image quality assessment (FIQA) algorithms are being integrated into\nonline identity management applications. These applications allow users to\nupload a face image as part of their document issuance process, where the image\nis then run through a quality assessment process to make sure it meets the\nquality and compliance requirements. Concerns about demographic bias have been\nraised about biometric systems, given the societal implications this may cause.\nIt is therefore important that demographic variability in FIQA algorithms is\nassessed such that mitigation measures can be created. In this work, we study\nthe demographic variability of all face image quality measures included in the\nISO/IEC 29794-5 international standard across three demographic variables: age,\ngender, and skin tone. 
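The instance-dependent routing in the MoPE entry above can be pictured as a gate that scores a small pool of learned prompt experts and mixes them per instance. The following is a loose, hedged sketch of that idea only; the module name, gating input, and shapes are assumptions rather than the paper's design:

```python
import torch
import torch.nn as nn

class PromptExpertRouter(nn.Module):
    def __init__(self, feat_dim, prompt_len, prompt_dim, num_experts=4):
        super().__init__()
        # A small pool of learnable prompt "experts".
        self.experts = nn.Parameter(torch.randn(num_experts, prompt_len, prompt_dim) * 0.02)
        self.gate = nn.Linear(feat_dim, num_experts)

    def forward(self, feats):                     # feats: (B, feat_dim) instance features
        w = self.gate(feats).softmax(dim=-1)      # (B, num_experts) routing weights
        return torch.einsum("be,eld->bld", w, self.experts)  # per-instance mixed prompt
```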
The results are rather promising and show no clear bias\ntoward any specific demographic group for most measures. Only two quality\nmeasures are found to have considerable variations in their outcomes for\ndifferent groups on the skin tone variable.\n","authors":["Wassim Kabbani","Kiran Raja","Raghavendra Ramachandra","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2501.07898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07888v1","updated":"2025-01-14T06:54:39Z","published":"2025-01-14T06:54:39Z","title":"Tarsier2: Advancing Large Vision-Language Models from Detailed Video\n Description to Comprehensive Video Understanding","summary":" We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM)\ndesigned for generating detailed and accurate video descriptions, while also\nexhibiting superior general video understanding capabilities. Tarsier2 achieves\nsignificant advancements through three key upgrades: (1) Scaling pre-training\ndata from 11M to 40M video-text pairs, enriching both volume and diversity; (2)\nPerforming fine-grained temporal alignment during supervised fine-tuning; (3)\nUsing model-based sampling to automatically construct preference data and\napplying DPO training for optimization. Extensive experiments show that\nTarsier2-7B consistently outperforms leading proprietary models, including\nGPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K\nbenchmark, Tarsier2-7B improves F1 by 2.8\\% over GPT-4o and 5.8\\% over\nGemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6\\%\nperformance advantage over GPT-4o and +24.9\\% over Gemini-1.5-Pro. Tarsier2-7B\nalso sets new state-of-the-art results across 15 public benchmarks, spanning\ntasks such as video question-answering, video grounding, hallucination test,\nand embodied question-answering, demonstrating its versatility as a robust\ngeneralist vision-language model.\n","authors":["Liping Yuan","Jiawei Wang","Haomiao Sun","Yuchen Zhang","Yuan Lin"],"pdf_url":"https://arxiv.org/pdf/2501.07888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07885v1","updated":"2025-01-14T06:51:27Z","published":"2025-01-14T06:51:27Z","title":"Mitigating Algorithmic Bias in Multiclass CNN Classifications Using\n Causal Modeling","summary":" This study describes a procedure for applying causal modeling to detect and\nmitigate algorithmic bias in a multiclass classification problem. The dataset\nwas derived from the FairFace dataset, supplemented with emotional labels\ngenerated by the DeepFace pre-trained model. A custom Convolutional Neural\nNetwork (CNN) was developed, consisting of four convolutional blocks, followed\nby fully connected layers and dropout layers to mitigate overfitting. Gender\nbias was identified in the CNN model's classifications: Females were more\nlikely to be classified as \"happy\" or \"sad,\" while males were more likely to be\nclassified as \"neutral.\" To address this, the one-vs-all (OvA) technique was\napplied. A causal model was constructed for each emotion class to adjust the\nCNN model's predicted class probabilities. The adjusted probabilities for the\nvarious classes were then aggregated by selecting the class with the highest\nprobability. The resulting debiased classifications demonstrated enhanced\ngender fairness across all classes, with negligible impact--or even a slight\nimprovement--on overall accuracy. This study highlights that algorithmic\nfairness and accuracy are not necessarily trade-offs. 
All data and code for\nthis study are publicly available for download.\n","authors":["Min Sik Byun","Wendy Wan Yee Hui","Wai Kwong Lau"],"pdf_url":"https://arxiv.org/pdf/2501.07885v1.pdf","comment":"7 pages; 6 figures"},{"id":"http://arxiv.org/abs/2501.07171v2","updated":"2025-01-14T06:46:14Z","published":"2025-01-13T09:58:03Z","title":"BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and\n Vision-Language Models Derived from Scientific Literature","summary":" The development of vision-language models (VLMs) is driven by large-scale and\ndiverse multimodal datasets. However, progress toward generalist biomedical\nVLMs is limited by the lack of annotated, publicly accessible datasets across\nbiology and medicine. Existing efforts are restricted to narrow domains,\nmissing the full diversity of biomedical knowledge encoded in scientific\nliterature. To address this gap, we introduce BIOMEDICA, a scalable,\nopen-source framework to extract, annotate, and serialize the entirety of the\nPubMed Central Open Access subset into an easy-to-use, publicly accessible\ndataset. Our framework produces a comprehensive archive with over 24 million\nunique image-text pairs from over 6 million articles. Metadata and\nexpert-guided annotations are also provided. We demonstrate the utility and\naccessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style\nmodels continuously pre-trained on the BIOMEDICA dataset via streaming,\neliminating the need to download 27 TB of data locally. On average, our models\nachieve state-of-the-art performance across 40 tasks - spanning pathology,\nradiology, ophthalmology, dermatology, surgery, molecular biology,\nparasitology, and cell biology - excelling in zero-shot classification with a\n6.56% average improvement (as high as 29.8% and 17.5% in dermatology and\nophthalmology, respectively), and stronger image-text retrieval, all while\nusing 10x less compute. To foster reproducibility and collaboration, we release\nour codebase and dataset for the broader research community.\n","authors":["Alejandro Lozano","Min Woo Sun","James Burgess","Liangyu Chen","Jeffrey J Nirschl","Jeffrey Gu","Ivan Lopez","Josiah Aklilu","Austin Wolfgang Katzer","Collin Chiu","Anita Rau","Xiaohan Wang","Yuhui Zhang","Alfred Seunghoon Song","Robert Tibshirani","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2501.07171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07870v1","updated":"2025-01-14T06:21:31Z","published":"2025-01-14T06:21:31Z","title":"Make-A-Character 2: Animatable 3D Character Generation From a Single\n Image","summary":" This report introduces Make-A-Character 2, an advanced system for generating\nhigh-quality 3D characters from single portrait photographs, ideal for game\ndevelopment and digital human applications. Make-A-Character 2 builds upon its\npredecessor by incorporating several significant improvements for image-based\nhead generation. We utilize the IC-Light method to correct non-ideal\nillumination in input photos and apply neural network-based color correction to\nharmonize skin tones between the photos and game engine renders. We also employ\nthe Hierarchical Representation Network to capture high-frequency facial\nstructures and conduct adaptive skeleton calibration for accurate and\nexpressive facial animations. The entire image-to-3D-character generation\nprocess takes less than 2 minutes. 
Furthermore, we leverage transformer\narchitecture to generate co-speech facial and gesture actions, enabling\nreal-time conversation with the generated character. These technologies have\nbeen integrated into our conversational AI avatar products.\n","authors":["Lin Liu","Yutong Wang","Jiahao Chen","Jianfang Li","Tangli Xue","Longlong Li","Jianqiang Ren","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2501.07870v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2501.07859v1","updated":"2025-01-14T05:55:20Z","published":"2025-01-14T05:55:20Z","title":"deepTerra -- AI Land Classification Made Easy","summary":" deepTerra is a comprehensive platform designed to facilitate the\nclassification of land surface features using machine learning and satellite\nimagery. The platform includes modules for data collection, image augmentation,\ntraining, testing, and prediction, streamlining the entire workflow for image\nclassification tasks. This paper presents a detailed overview of the\ncapabilities of deepTerra, shows how it has been applied to various research\nareas, and discusses the future directions it might take.\n","authors":["Andrew Keith Wilkinson"],"pdf_url":"https://arxiv.org/pdf/2501.07859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00836v2","updated":"2025-01-14T05:49:16Z","published":"2025-01-01T13:38:15Z","title":"Recognizing Artistic Style of Archaeological Image Fragments Using Deep\n Style Extrapolation","summary":" Ancient artworks obtained in archaeological excavations usually suffer from a\ncertain degree of fragmentation and physical degradation. Often, fragments of\nmultiple artifacts from different periods or artistic styles could be found on\nthe same site. With each fragment containing only partial information about its\nsource, and pieces from different objects being mixed, categorizing broken\nartifacts based on their visual cues could be a challenging task, even for\nprofessionals. As classification is a common function of many machine learning\nmodels, the power of modern architectures can be harnessed for efficient and\naccurate fragment classification. In this work, we present a generalized\ndeep-learning framework for predicting the artistic style of image fragments,\nachieving state-of-the-art results for pieces with varying styles and\ngeometries.\n","authors":["Gur Elkin","Ofir Itzhak Shahar","Yaniv Ohayon","Nadav Alali","Ohad Ben-Shahar"],"pdf_url":"https://arxiv.org/pdf/2501.00836v2.pdf","comment":"To be published in the 27th International Conference on\n Human-Computer Interaction (HCII 2025)"},{"id":"http://arxiv.org/abs/2501.07855v1","updated":"2025-01-14T05:43:59Z","published":"2025-01-14T05:43:59Z","title":"State-of-the-Art Transformer Models for Image Super-Resolution:\n Techniques, Challenges, and Applications","summary":" Image Super-Resolution (SR) aims to recover a high-resolution image from its\nlow-resolution counterpart, which has been affected by a specific degradation\nprocess. This is achieved by enhancing detail and visual quality. Recent\nadvancements in transformer-based methods have remolded image super-resolution\nby enabling high-quality reconstructions surpassing previous deep-learning\napproaches like CNN and GAN-based. This effectively addresses the limitations\nof previous methods, such as limited receptive fields, poor global context\ncapture, and challenges in high-frequency detail recovery. 
Additionally, the\npaper reviews recent trends and advancements in transformer-based SR models,\nexploring various innovative techniques and architectures that combine\ntransformers with traditional networks to balance global and local contexts.\nThese neoteric methods are critically analyzed, revealing promising yet\nunexplored gaps and potential directions for future research. Several\nvisualizations of models and techniques are included to foster a holistic\nunderstanding of recent trends. This work seeks to offer a structured roadmap\nfor researchers at the forefront of deep learning, specifically exploring the\nimpact of transformers on super-resolution techniques.\n","authors":["Debasish Dutta","Deepjyoti Chetia","Neeharika Sonowal","Sanjib Kr Kalita"],"pdf_url":"https://arxiv.org/pdf/2501.07855v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2501.07850v1","updated":"2025-01-14T05:23:42Z","published":"2025-01-14T05:23:42Z","title":"An Intra- and Cross-frame Topological Consistency Scheme for\n Semi-supervised Atherosclerotic Coronary Plaque Segmentation","summary":" Enhancing the precision of segmenting coronary atherosclerotic plaques from\nCT Angiography (CTA) images is pivotal for advanced Coronary Atherosclerosis\nAnalysis (CAA), which distinctively relies on the analysis of vessel\ncross-section images reconstructed via Curved Planar Reformation. This task\npresents significant challenges due to the indistinct boundaries and structures\nof plaques and blood vessels, leading to the inadequate performance of current\ndeep learning models, compounded by the inherent difficulty in annotating such\ncomplex data. To address these issues, we propose a novel dual-consistency\nsemi-supervised framework that integrates Intra-frame Topological Consistency\n(ITC) and Cross-frame Topological Consistency (CTC) to leverage labeled and\nunlabeled data. ITC employs a dual-task network for simultaneous segmentation\nmask and Skeleton-aware Distance Transform (SDT) prediction, achieving similar\nprediction of topology structure through consistency constraint without\nadditional annotations. Meanwhile, CTC utilizes an unsupervised estimator for\nanalyzing pixel flow between skeletons and boundaries of adjacent frames,\nensuring spatial continuity. Experiments on two CTA datasets show that our\nmethod surpasses existing semi-supervised methods and approaches the\nperformance of supervised methods on CAA. In addition, our method also performs\nbetter than other methods on the ACDC dataset, demonstrating its\ngeneralization.\n","authors":["Ziheng Zhang","Zihan Li","Dandan Shan","Yuehui Qiu","Qingqi Hong","Qingqiang Wu"],"pdf_url":"https://arxiv.org/pdf/2501.07850v1.pdf","comment":"Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.06480v2","updated":"2025-01-14T04:16:54Z","published":"2025-01-11T08:13:13Z","title":"Flash Window Attention: speedup the attention computation for Swin\n Transformer","summary":" To address the high resolution of image pixels, the Swin Transformer\nintroduces window attention. This mechanism divides an image into\nnon-overlapping windows and restricts attention computation to within each\nwindow, significantly enhancing computational efficiency. To further optimize\nthis process, one might consider replacing standard attention with flash\nattention, which has proven to be more efficient in language models. However, a\ndirect substitution is ineffective. 
Flash attention is designed for long\nsequences, whereas window attention deals with shorter sequences but must\nhandle numerous of them in parallel. In this report, we present an optimized\nsolution called Flash Window Attention, tailored specifically for window\nattention. Flash Window Attention improves attention computation efficiency by\nup to 300% and enhances end-to-end runtime efficiency by up to 30%. Our code is\navailable online.\n","authors":["Zhendong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.06480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15975v4","updated":"2025-01-14T03:55:17Z","published":"2022-11-29T07:18:32Z","title":"Analyzing Infrastructure LiDAR Placement with Realistic LiDAR Simulation\n Library","summary":" Recently, Vehicle-to-Everything(V2X) cooperative perception has attracted\nincreasing attention. Infrastructure sensors play a critical role in this\nresearch field; however, how to find the optimal placement of infrastructure\nsensors is rarely studied. In this paper, we investigate the problem of\ninfrastructure sensor placement and propose a pipeline that can efficiently and\neffectively find optimal installation positions for infrastructure sensors in a\nrealistic simulated environment. To better simulate and evaluate LiDAR\nplacement, we establish a Realistic LiDAR Simulation library that can simulate\nthe unique characteristics of different popular LiDARs and produce\nhigh-fidelity LiDAR point clouds in the CARLA simulator. Through simulating\npoint cloud data in different LiDAR placements, we can evaluate the perception\naccuracy of these placements using multiple detection models. Then, we analyze\nthe correlation between the point cloud distribution and perception accuracy by\ncalculating the density and uniformity of regions of interest. Experiments show\nthat when using the same number and type of LiDAR, the placement scheme\noptimized by our proposed method improves the average precision by 15%,\ncompared with the conventional placement scheme in the standard lane scene. We\nalso analyze the correlation between perception performance in the region of\ninterest and LiDAR point cloud distribution and validate that density and\nuniformity can be indicators of performance. Both the RLS Library and related\ncode will be released at https://github.com/PJLab-ADG/PCSim.\n","authors":["Xinyu Cai","Wentao Jiang","Runsheng Xu","Wenquan Zhao","Jiaqi Ma","Si Liu","Yikang Li"],"pdf_url":"https://arxiv.org/pdf/2211.15975v4.pdf","comment":"7 pages, 6 figures, accepted to the IEEE International Conference on\n Robotics and Automation (ICRA'23)"},{"id":"http://arxiv.org/abs/2501.07819v1","updated":"2025-01-14T03:50:23Z","published":"2025-01-14T03:50:23Z","title":"3UR-LLM: An End-to-End Multimodal Large Language Model for 3D Scene\n Understanding","summary":" Multi-modal Large Language Models (MLLMs) exhibit impressive capabilities in\n2D tasks, yet encounter challenges in discerning the spatial positions,\ninterrelations, and causal logic in scenes when transitioning from 2D to 3D\nrepresentations. We find that the limitations mainly lie in: i) the high\nannotation cost restricting the scale-up of volumes of 3D scene data, and ii)\nthe lack of a straightforward and effective way to perceive 3D information\nwhich results in prolonged training durations and complicates the streamlined\nframework. 
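The Flash Window Attention entry above hinges on the shape mismatch between flash attention (few long sequences) and window attention (many short windows processed in parallel). A minimal sketch of plain batched window attention, using PyTorch's fused scaled_dot_product_attention as a stand-in for a flash-style kernel (illustrative only, not the paper's kernel):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class WindowAttention(nn.Module):
    def __init__(self, dim, window_size=7, num_heads=4):
        super().__init__()
        self.ws, self.h, self.d = window_size, num_heads, dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):               # x: (B, H, W, C); H, W divisible by window_size
        B, H, W, C = x.shape
        ws = self.ws
        # Partition into non-overlapping windows and fold them into the batch axis.
        x = x.view(B, H // ws, ws, W // ws, ws, C)
        x = x.permute(0, 1, 3, 2, 4, 5).reshape(-1, ws * ws, C)
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        q, k, v = [t.reshape(-1, ws * ws, self.h, self.d).transpose(1, 2) for t in (q, k, v)]
        # Fused kernel when available; every short window is attended in parallel.
        out = F.scaled_dot_product_attention(q, k, v)
        out = self.proj(out.transpose(1, 2).reshape(-1, ws * ws, C))
        # Undo the window partition.
        out = out.reshape(B, H // ws, W // ws, ws, ws, C)
        return out.permute(0, 1, 3, 2, 4, 5).reshape(B, H, W, C)
```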
To this end, we develop pipeline based on open-source 2D MLLMs and\nLLMs to generate high-quality 3D-text pairs and construct 3DS-160K , to enhance\nthe pre-training process. Leveraging this high-quality pre-training data, we\nintroduce the 3UR-LLM model, an end-to-end 3D MLLM designed for precise\ninterpretation of 3D scenes, showcasing exceptional capability in navigating\nthe complexities of the physical world. 3UR-LLM directly receives 3D point\ncloud as input and project 3D features fused with text instructions into a\nmanageable set of tokens. Considering the computation burden derived from these\nhybrid tokens, we design a 3D compressor module to cohesively compress the 3D\nspatial cues and textual narrative. 3UR-LLM achieves promising performance with\nrespect to the previous SOTAs, for instance, 3UR-LLM exceeds its counterparts\nby 7.1\\% CIDEr on ScanQA, while utilizing fewer training resources. The code\nand model weights for 3UR-LLM and the 3DS-160K benchmark are available at\n3UR-LLM.\n","authors":["Haomiao Xiong","Yunzhi Zhuge","Jiawen Zhu","Lu Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2501.07819v1.pdf","comment":"Accepted to IEEE Transactions on Multimedia (TMM)"},{"id":"http://arxiv.org/abs/2412.10106v4","updated":"2025-01-14T03:43:02Z","published":"2024-12-13T12:47:30Z","title":"A Cascaded Dilated Convolution Approach for Mpox Lesion Classification","summary":" The global outbreak of the Mpox virus, classified as a Public Health\nEmergency of International Concern (PHEIC) by the World Health Organization,\npresents significant diagnostic challenges due to its visual similarity to\nother skin lesion diseases. Traditional diagnostic methods for Mpox, which rely\non clinical symptoms and laboratory tests, are slow and labor intensive. Deep\nlearning-based approaches for skin lesion classification offer a promising\nalternative. However, developing a model that balances efficiency with accuracy\nis crucial to ensure reliable and timely diagnosis without compromising\nperformance. This study introduces the Cascaded Atrous Group Attention (CAGA)\nframework to address these challenges, combining the Cascaded Atrous Attention\nmodule and the Cascaded Group Attention mechanism. The Cascaded Atrous\nAttention module utilizes dilated convolutions and cascades the outputs to\nenhance multi-scale representation. This is integrated into the Cascaded Group\nAttention mechanism, which reduces redundancy in Multi-Head Self-Attention. By\nintegrating the Cascaded Atrous Group Attention module with EfficientViT-L1 as\nthe backbone architecture, this approach achieves state-of-the-art performance,\nreaching an accuracy of 98% on the Mpox Close Skin Image (MCSI) dataset while\nreducing model parameters by 37.5% compared to the original EfficientViT-L1.\nThe model's robustness is demonstrated through extensive validation on two\nadditional benchmark datasets, where it consistently outperforms existing\napproaches.\n","authors":["Ayush Deshmukh"],"pdf_url":"https://arxiv.org/pdf/2412.10106v4.pdf","comment":"8 pages, 4 figures, Submitted to Medical Imaging with Deep Learning"},{"id":"http://arxiv.org/abs/2501.07810v1","updated":"2025-01-14T03:20:20Z","published":"2025-01-14T03:20:20Z","title":"AVS-Mamba: Exploring Temporal and Multi-modal Mamba for Audio-Visual\n Segmentation","summary":" The essence of audio-visual segmentation (AVS) lies in locating and\ndelineating sound-emitting objects within a video stream. 
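The Cascaded Atrous Attention module in the Mpox classification entry above cascades dilated (atrous) convolutions to enlarge the receptive field at several rates. A generic sketch of such a cascade (the dilation rates and the aggregation by summation are illustrative, not the paper's exact block):

```python
import torch
import torch.nn as nn

class CascadedDilatedConv(nn.Module):
    def __init__(self, channels, dilations=(1, 2, 4)):
        super().__init__()
        # padding = dilation keeps the spatial size for 3x3 kernels.
        self.stages = nn.ModuleList(
            nn.Conv2d(channels, channels, 3, padding=d, dilation=d) for d in dilations
        )

    def forward(self, x):
        outs = []
        for conv in self.stages:
            x = torch.relu(conv(x))   # each stage feeds the next (cascade)
            outs.append(x)
        return sum(outs)              # aggregate responses from all dilation rates
```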
While\nTransformer-based methods have shown promise, their handling of long-range\ndependencies struggles due to quadratic computational costs, presenting a\nbottleneck in complex scenarios. To overcome this limitation and facilitate\ncomplex multi-modal comprehension with linear complexity, we introduce\nAVS-Mamba, a selective state space model to address the AVS task. Our framework\nincorporates two key components for video understanding and cross-modal\nlearning: Temporal Mamba Block for sequential video processing and\nVision-to-Audio Fusion Block for advanced audio-vision integration. Building on\nthis, we develop the Multi-scale Temporal Encoder, aimed at enhancing the\nlearning of visual features across scales, facilitating the perception of\nintra- and inter-frame information. To perform multi-modal fusion, we propose\nthe Modality Aggregation Decoder, leveraging the Vision-to-Audio Fusion Block\nto integrate visual features into audio features across both frame and temporal\nlevels. Further, we adopt the Contextual Integration Pyramid to perform\naudio-to-vision spatial-temporal context collaboration. Through these\ninnovative contributions, our approach achieves new state-of-the-art results on\nthe AVSBench-object and AVSBench-semantic datasets. Our source code and model\nweights are available at AVS-Mamba.\n","authors":["Sitong Gong","Yunzhi Zhuge","Lu Zhang","Yifan Wang","Pingping Zhang","Lijun Wang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2501.07810v1.pdf","comment":"Accepted to IEEE Transactions on Multimedia (TMM)"},{"id":"http://arxiv.org/abs/2501.07808v1","updated":"2025-01-14T03:19:10Z","published":"2025-01-14T03:19:10Z","title":"A Low-cost and Ultra-lightweight Binary Neural Network for Traffic\n Signal Recognition","summary":" The deployment of neural networks in vehicle platforms and wearable\nArtificial Intelligence-of-Things (AIOT) scenarios has become a research area\nthat has attracted much attention. With the continuous evolution of deep\nlearning technology, many image classification models are committed to\nimproving recognition accuracy, but this is often accompanied by problems such\nas large model resource usage, complex structure, and high power consumption,\nwhich makes it challenging to deploy on resource-constrained platforms. Herein,\nwe propose an ultra-lightweight binary neural network (BNN) model designed for\nhardware deployment, and conduct image classification research based on the\nGerman Traffic Sign Recognition Benchmark (GTSRB) dataset. In addition, we also\nverify it on the Chinese Traffic Sign (CTS) and Belgian Traffic Sign (BTS)\ndatasets. The proposed model shows excellent recognition performance with an\naccuracy of up to 97.64%, making it one of the best performing BNN models in\nthe GTSRB dataset. Compared with the full-precision model, the accuracy loss is\ncontrolled within 1%, and the parameter storage overhead of the model is only\n10% of that of the full-precision model. More importantly, our network model\nonly relies on logical operations and low-bit width fixed-point addition and\nsubtraction operations during the inference phase, which greatly simplifies the\ndesign complexity of the processing element (PE). 
Our research shows the great\npotential of BNN in the hardware deployment of computer vision models,\nespecially in the field of computer vision tasks related to autonomous driving.\n","authors":["Mingke Xiao","Yue Su","Liang Yu","Guanglong Qu","Yutong Jia","Yukuan Chang","Xu Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07806v1","updated":"2025-01-14T03:15:46Z","published":"2025-01-14T03:15:46Z","title":"Learning Motion and Temporal Cues for Unsupervised Video Object\n Segmentation","summary":" In this paper, we address the challenges in unsupervised video object\nsegmentation (UVOS) by proposing an efficient algorithm, termed MTNet, which\nconcurrently exploits motion and temporal cues. Unlike previous methods that\nfocus solely on integrating appearance with motion or on modeling temporal\nrelations, our method combines both aspects by integrating them within a\nunified framework. MTNet is devised by effectively merging appearance and\nmotion features during the feature extraction process within encoders,\npromoting a more complementary representation. To capture the intricate\nlong-range contextual dynamics and information embedded within videos, a\ntemporal transformer module is introduced, facilitating efficacious inter-frame\ninteractions throughout a video clip. Furthermore, we employ a cascade of\ndecoders across all feature levels to optimally exploit the\nderived features, aiming to generate increasingly precise segmentation masks.\nAs a result, MTNet provides a strong and compact framework that explores both\ntemporal and cross-modality knowledge to robustly, accurately, and efficiently\nlocalize and track the primary object in various challenging scenarios.\nExtensive experiments across diverse benchmarks conclusively show that our\nmethod not only attains state-of-the-art performance in unsupervised video\nobject segmentation but also delivers competitive results in video salient\nobject detection. These findings highlight the method's robust versatility and\nits adeptness in adapting to a range of segmentation tasks. Source code is\navailable on https://github.com/hy0523/MTNet.\n","authors":["Yunzhi Zhuge","Hongyu Gu","Lu Zhang","Jinqing Qi","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2501.07806v1.pdf","comment":"Accepted to IEEE Transactions on Neural Networks and Learning Systems\n (TNNLS)"},{"id":"http://arxiv.org/abs/2501.07804v1","updated":"2025-01-14T03:12:25Z","published":"2025-01-14T03:12:25Z","title":"Balance Divergence for Knowledge Distillation","summary":" Knowledge distillation has been widely adopted in computer vision task\nprocessing, since it can effectively enhance the performance of lightweight\nstudent networks by leveraging the knowledge transferred from cumbersome\nteacher networks. Most existing knowledge distillation methods utilize\nKullback-Leibler divergence to mimic the logit output probabilities between the\nteacher network and the student network. Nonetheless, these methods may neglect\nthe negative parts of the teacher's ''dark knowledge'' because the divergence\ncalculations may ignore the effect of the minute probabilities from the\nteacher's logit output. This deficiency may lead to suboptimal performance in\nlogit mimicry during the distillation process and result in an imbalance of\ninformation acquired by the student network. In this paper, we investigate the\nimpact of this imbalance and propose a novel method, named Balance Divergence\nDistillation. 
By introducing a compensatory operation using reverse\nKullback-Leibler divergence, our method can improve the modeling of the\nextremely small values in the negative part of the teacher's output and\npreserve the learning capacity for the positive part. Furthermore, we test the\nimpact of different temperature coefficient adjustments, which may be conducted\nto further balance knowledge transfer. We evaluate the proposed method on several\ncomputer vision tasks, including image classification and semantic\nsegmentation. The evaluation results show that our method achieves an accuracy\nimprovement of 1%~3% for lightweight students on both the CIFAR-100 and ImageNet\ndatasets, and a 4.55% improvement in mIoU for PSP-ResNet18 on the Cityscapes\ndataset. The experiments show that our method is a simple yet highly effective\nsolution that can be smoothly applied to different knowledge distillation\nmethods.\n","authors":["Yafei Qi","Chen Wang","Zhaoning Zhang","Yaping Liu","Yongmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07800v1","updated":"2025-01-14T02:56:19Z","published":"2025-01-14T02:56:19Z","title":"BioPose: Biomechanically-accurate 3D Pose Estimation from Monocular\n Videos","summary":" Recent advancements in 3D human pose estimation from single-camera images and\nvideos have relied on parametric models, like SMPL. However, these models\noversimplify anatomical structures, limiting their accuracy in capturing true\njoint locations and movements, which reduces their applicability in\nbiomechanics, healthcare, and robotics. Biomechanically accurate pose\nestimation, on the other hand, typically requires costly marker-based motion\ncapture systems and optimization techniques in specialized labs. To bridge this\ngap, we propose BioPose, a novel learning-based framework for predicting\nbiomechanically accurate 3D human pose directly from monocular videos. BioPose\nincludes three key components: a Multi-Query Human Mesh Recovery model\n(MQ-HMR), a Neural Inverse Kinematics (NeurIK) model, and a 2D-informed pose\nrefinement technique. MQ-HMR leverages a multi-query deformable transformer to\nextract multi-scale fine-grained image features, enabling precise human mesh\nrecovery. NeurIK treats the mesh vertices as virtual markers, applying a\nspatial-temporal network to regress biomechanically accurate 3D poses under\nanatomical constraints. To further improve 3D pose estimations, a 2D-informed\nrefinement step optimizes the query tokens during inference by aligning the 3D\nstructure with 2D pose observations. Experiments on benchmark datasets\ndemonstrate that BioPose significantly outperforms state-of-the-art methods.\nProject website:\n\\url{https://m-usamasaleem.github.io/publication/BioPose/BioPose.html}.\n","authors":["Farnoosh Koleini","Muhammad Usama Saleem","Pu Wang","Hongfei Xue","Ahmed Helmy","Abbey Fenwick"],"pdf_url":"https://arxiv.org/pdf/2501.07800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09323v3","updated":"2025-01-14T02:56:00Z","published":"2024-09-14T05:53:33Z","title":"Implicit Neural Representations with Fourier Kolmogorov-Arnold Networks","summary":" Implicit neural representations (INRs) use neural networks to provide\ncontinuous and resolution-independent representations of complex signals with a\nsmall number of parameters. However, existing INR models often fail to capture\nimportant frequency components specific to each task. 
To address this issue, in\nthis paper, we propose a Fourier Kolmogorov Arnold network (FKAN) for INRs. The\nproposed FKAN utilizes learnable activation functions modeled as Fourier series\nin the first layer to effectively control and learn the task-specific frequency\ncomponents. In addition, the activation functions with learnable Fourier\ncoefficients improve the ability of the network to capture complex patterns and\ndetails, which is beneficial for high-resolution and high-dimensional data.\nExperimental results show that our proposed FKAN model outperforms three\nstate-of-the-art baseline schemes, and improves the peak signal-to-noise ratio\n(PSNR) and structural similarity index measure (SSIM) for the image\nrepresentation task and intersection over union (IoU) for the 3D occupancy\nvolume representation task, respectively. The code is available at\ngithub.com/Ali-Meh619/FKAN.\n","authors":["Ali Mehrabian","Parsa Mojarad Adi","Moein Heidari","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2409.09323v3.pdf","comment":"Accepted for publication in Proc. IEEE ICASSP 2025"},{"id":"http://arxiv.org/abs/2407.02772v2","updated":"2025-01-14T02:30:09Z","published":"2024-07-03T03:01:43Z","title":"Gradient descent with generalized Newton's method","summary":" We propose the generalized Newton's method (GeN) -- a Hessian-informed\napproach that applies to any optimizer such as SGD and Adam, and covers the\nNewton-Raphson method as a sub-case. Our method automatically and dynamically\nselects the learning rate that accelerates the convergence, without the\nintensive tuning of the learning rate scheduler. In practice, our method is\neasily implementable, since it only requires additional forward passes with\nalmost zero computational overhead (in terms of training time and memory cost),\nif the overhead is amortized over many iterations. We present extensive\nexperiments on language and vision tasks (e.g. GPT and ResNet) to showcase that\nGeN optimizers match the state-of-the-art performance, which was achieved with\ncarefully tuned learning rate schedulers.\n","authors":["Zhiqi Bu","Shiyun Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15761v2","updated":"2025-01-14T02:10:46Z","published":"2024-11-24T09:12:37Z","title":"MambaTrack: Exploiting Dual-Enhancement for Night UAV Tracking","summary":" Night unmanned aerial vehicle (UAV) tracking is impeded by the challenges of\npoor illumination, with previous daylight-optimized methods demonstrating\nsuboptimal performance in low-light conditions, limiting the utility of UAV\napplications. To this end, we propose an efficient mamba-based tracker,\nleveraging dual enhancement techniques to boost night UAV tracking. The\nmamba-based low-light enhancer, equipped with an illumination estimator and a\ndamage restorer, achieves global image enhancement while preserving the details\nand structure of low-light images. Additionally, we advance a cross-modal mamba\nnetwork to achieve efficient interactive learning between vision and language\nmodalities. Extensive experiments showcase that our method achieves advanced\nperformance and exhibits significantly improved computation and memory\nefficiency. For instance, our method is 2.8$\\times$ faster than CiteTracker and\nreduces 50.2$\\%$ GPU memory. 
Our codes are available at\n\\url{https://github.com/983632847/Awesome-Multimodal-Object-Tracking}.\n","authors":["Chunhui Zhang","Li Liu","Hao Wen","Xi Zhou","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.15761v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.14880v4","updated":"2025-01-14T01:57:44Z","published":"2024-04-04T20:06:07Z","title":"Dissecting Query-Key Interaction in Vision Transformers","summary":" Self-attention in vision transformers is often thought to perform perceptual\ngrouping where tokens attend to other tokens with similar embeddings, which\ncould correspond to semantically similar features of an object. However,\nattending to dissimilar tokens can be beneficial by providing contextual\ninformation. We propose to analyze the query-key interaction by the singular\nvalue decomposition of the interaction matrix (i.e.\n${\\textbf{W}_q}^\\top\\textbf{W}_k$). We find that in many ViTs, especially those\nwith classification training objectives, early layers attend more to similar\ntokens, while late layers show increased attention to dissimilar tokens,\nproviding evidence corresponding to perceptual grouping and contextualization,\nrespectively. Many of these interactions between features represented by\nsingular vectors are interpretable and semantic, such as attention between\nrelevant objects, between parts of an object, or between the foreground and\nbackground. This offers a novel perspective on interpreting the attention\nmechanism, which contributes to understanding how transformer models utilize\ncontext and salient features when processing images.\n","authors":["Xu Pan","Aaron Philip","Ziqian Xie","Odelia Schwartz"],"pdf_url":"https://arxiv.org/pdf/2405.14880v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07783v1","updated":"2025-01-14T01:57:41Z","published":"2025-01-14T01:57:41Z","title":"Parameter-Inverted Image Pyramid Networks for Visual Perception and\n Multimodal Understanding","summary":" Image pyramids are widely adopted in top-performing methods to obtain\nmulti-scale features for precise visual perception and understanding. However,\ncurrent image pyramids use the same large-scale model to process multiple\nresolutions of images, leading to significant computational cost. To address\nthis challenge, we propose a novel network architecture, called\nParameter-Inverted Image Pyramid Networks (PIIP). Specifically, PIIP uses\npretrained models (ViTs or CNNs) as branches to process multi-scale images,\nwhere images of higher resolutions are processed by smaller network branches to\nbalance computational cost and performance. To integrate information from\ndifferent spatial scales, we further propose a novel cross-branch feature\ninteraction mechanism. To validate PIIP, we apply it to various perception\nmodels and a representative multimodal large language model called LLaVA, and\nconduct extensive experiments on various tasks such as object detection,\nsegmentation, image classification and multimodal understanding. PIIP achieves\nsuperior performance compared to single-branch and existing multi-resolution\napproaches with lower computational cost. When applied to InternViT-6B, a\nlarge-scale vision foundation model, PIIP can improve its performance by 1%-2%\non detection and segmentation with only 40%-60% of the original computation,\nfinally achieving 60.0 box AP on MS COCO and 59.7 mIoU on ADE20K. For\nmultimodal understanding, our PIIP-LLaVA achieves 73.0% accuracy on TextVQA and\n74.5% on MMBench with only 2.8M training data. 
Our code is released at\nhttps://github.com/OpenGVLab/PIIP.\n","authors":["Zhaokai Wang","Xizhou Zhu","Xue Yang","Gen Luo","Hao Li","Changyao Tian","Wenhan Dou","Junqi Ge","Lewei Lu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2501.07783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12463v2","updated":"2025-01-14T01:57:04Z","published":"2024-08-22T15:04:59Z","title":"Smartphone-based Eye Tracking System using Edge Intelligence and Model\n Optimisation","summary":" A significant limitation of current smartphone-based eye-tracking algorithms\nis their low accuracy when applied to video-type visual stimuli, as they are\ntypically trained on static images. Also, the increasing demand for real-time\ninteractive applications like games, VR, and AR on smartphones requires\novercoming the limitations posed by resource constraints such as limited\ncomputational power, battery life, and network bandwidth. Therefore, we\ndeveloped two new smartphone eye-tracking techniques for video-type visuals by\ncombining Convolutional Neural Networks (CNN) with two different Recurrent\nNeural Networks (RNN), namely Long Short Term Memory (LSTM) and Gated Recurrent\nUnit (GRU). Our CNN+LSTM and CNN+GRU models achieved an average Root Mean\nSquare Error of 0.955 cm and 1.091 cm, respectively. To address the\ncomputational constraints of smartphones, we developed an edge intelligence\narchitecture to enhance the performance of smartphone-based eye tracking. We\napplied various optimisation methods like quantisation and pruning to deep\nlearning models for better energy, CPU, and memory usage on edge devices,\nfocusing on real-time processing. Using model quantisation, the model inference\ntime in the CNN+LSTM and CNN+GRU models was reduced by 21.72% and 19.50%,\nrespectively, on edge devices.\n","authors":["Nishan Gunawardena","Gough Yumu Lui","Jeewani Anupama Ginige","Bahman Javadi"],"pdf_url":"https://arxiv.org/pdf/2408.12463v2.pdf","comment":"I have included the three papers as reference, which are closely\n related. We have expanded the future work section to provide a more thorough\n discussion of the concepts of \"varying lighting conditions\" and \"dynamic user\n environments.\" We have added a note below Table 4 to clarify the\n abbreviations' meaning. Elaborated the role of the Domain Expert within the\n presentation layer in Section 4.1"},{"id":"http://arxiv.org/abs/2407.14649v2","updated":"2025-01-14T01:39:22Z","published":"2024-07-19T19:56:53Z","title":"The Collection of a Human Robot Collaboration Dataset for Cooperative\n Assembly in Glovebox Environments","summary":" Industry 4.0 introduced AI as a transformative solution for modernizing\nmanufacturing processes. Its successor, Industry 5.0, envisions humans as\ncollaborators and experts guiding these AI-driven manufacturing solutions.\nDeveloping these techniques necessitates algorithms capable of safe, real-time\nidentification of human positions in a scene, particularly their hands, during\ncollaborative assembly. Although substantial efforts have curated datasets for\nhand segmentation, most focus on residential or commercial domains. Existing\ndatasets targeting industrial settings predominantly rely on synthetic data,\nwhich we demonstrate does not effectively transfer to real-world operations.\nMoreover, these datasets lack uncertainty estimations critical for safe\ncollaboration. Addressing these gaps, we present HAGS: Hand and Glove\nSegmentation Dataset. 
This dataset provides challenging examples to build\napplications toward hand and glove segmentation in industrial human-robot\ncollaboration scenarios as well as assess out-of-distribution images,\nconstructed via green screen augmentations, to determine ML-classifier\nrobustness. We study state-of-the-art, real-time segmentation models to\nevaluate existing methods. Our dataset and baselines are publicly available.\n","authors":["Shivansh Sharma","Mathew Huang","Sanat Nair","Alan Wen","Christina Petlowany","Juston Moore","Selma Wanna","Mitch Pryor"],"pdf_url":"https://arxiv.org/pdf/2407.14649v2.pdf","comment":"draft paper to be submitted to IJRR"},{"id":"http://arxiv.org/abs/2210.01272v3","updated":"2025-01-14T01:34:10Z","published":"2022-10-03T23:44:38Z","title":"A systematic review of the use of Deep Learning in Satellite Imagery for\n Agriculture","summary":" Agricultural research is essential for increasing food production to meet the\nrequirements of an increasing population in the coming decades. Recently,\nsatellite technology has been improving rapidly and deep learning has seen much\nsuccess in generic computer vision tasks and many application areas which\npresents an important opportunity to improve analysis of agricultural land.\nHere we present a systematic review of 150 studies to find the current uses of\ndeep learning on satellite imagery for agricultural research. Although we\nidentify 5 categories of agricultural monitoring tasks, the majority of the\nresearch interest is in crop segmentation and yield prediction. We found that,\nwhen used, modern deep learning methods consistently outperformed traditional\nmachine learning across most tasks; the only exception was that Long Short-Term\nMemory (LSTM) Recurrent Neural Networks did not consistently outperform Random\nForests (RF) for yield prediction. The reviewed studies have largely adopted\nmethodologies from generic computer vision, except for one major omission:\nbenchmark datasets are not utilised to evaluate models across studies, making\nit difficult to compare results. Additionally, some studies have specifically\nutilised the extra spectral resolution available in satellite imagery, but\nother divergent properties of satellite images - such as the hugely different\nscales of spatial patterns - are not being taken advantage of in the reviewed\nstudies.\n","authors":["Brandon Victor","Zhen He","Aiden Nibali"],"pdf_url":"https://arxiv.org/pdf/2210.01272v3.pdf","comment":"23 pages, 5 figures and 10 tables in main paper. Final version, as\n submitted and accepted at JSTARS"},{"id":"http://arxiv.org/abs/2408.06170v4","updated":"2025-01-14T01:27:36Z","published":"2024-08-12T14:16:10Z","title":"Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment\n Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging","summary":" Objectives: To evaluate the zero-shot performance of Segment Anything Model 2\n(SAM 2) in 3D segmentation of abdominal organs in CT scans, and to investigate\nthe effects of prompt settings on segmentation results.\n Materials and Methods: In this retrospective study, we used a subset of the\nTotalSegmentator CT dataset from eight institutions to assess SAM 2's ability\nto segment eight abdominal organs. Segmentation was initiated from three\ndifferent z-coordinate levels (caudal, mid, and cranial levels) of each organ.\nPerformance was measured using the Dice similarity coefficient (DSC). 
We also\nanalyzed the impact of \"negative prompts,\" which explicitly exclude certain\nregions from the segmentation process, on accuracy.\n Results: 123 patients (mean age, 60.7 \\pm 15.5 years; 63 men, 60 women) were\nevaluated. As a zero-shot approach, larger organs with clear boundaries\ndemonstrated high segmentation performance, with mean DSCs as follows: liver\n0.821 \\pm 0.192, right kidney 0.862 \\pm 0.212, left kidney 0.870 \\pm 0.154, and\nspleen 0.891 \\pm 0.131. Smaller organs showed lower performance: gallbladder\n0.531 \\pm 0.291, pancreas 0.361 \\pm 0.197, and adrenal glands, right 0.203 \\pm\n0.222, left 0.308 \\pm 0.234. The initial slice for segmentation and the use of\nnegative prompts significantly influenced the results. By removing negative\nprompts from the input, the DSCs significantly decreased for six organs.\n Conclusion: SAM 2 demonstrated promising zero-shot performance in segmenting\ncertain abdominal organs in CT scans, particularly larger organs. Performance\nwas significantly influenced by input negative prompts and initial slice\nselection, highlighting the importance of optimizing these factors.\n","authors":["Yosuke Yamagishi","Shouhei Hanaoka","Tomohiro Kikuchi","Takahiro Nakao","Yuta Nakamura","Yukihiro Nomura","Soichiro Miki","Takeharu Yoshikawa","Osamu Abe"],"pdf_url":"https://arxiv.org/pdf/2408.06170v4.pdf","comment":"20 pages, 7 figures (including 2 supplemental figures), 4 tables"},{"id":"http://arxiv.org/abs/2306.03983v2","updated":"2025-01-14T01:10:52Z","published":"2023-06-06T19:36:11Z","title":"XVertNet: Unsupervised Contrast Enhancement of Vertebral Structures with\n Dynamic Self-Tuning Guidance and Multi-Stage Analysis","summary":" Chest X-rays remain the primary diagnostic tool in emergency medicine, yet\ntheir limited ability to capture fine anatomical details can result in missed\nor delayed diagnoses. To address this, we introduce XVertNet, a novel\ndeep-learning framework designed to enhance vertebral structure visualization\nin X-ray images significantly. Our framework introduces two key innovations:\n(1) an unsupervised learning architecture that eliminates reliance on manually\nlabeled training data, a persistent bottleneck in medical imaging, and (2) a\ndynamic self-tuned internal guidance mechanism featuring an adaptive feedback\nloop for real-time image optimization. Extensive validation across four major\npublic datasets revealed that XVertNet outperforms state-of-the-art enhancement\nmethods, as demonstrated by improvements in entropy scores, Tenengrad criterion\nvalues, the local phase coherence sharpness index (LPC-SI), and the tone-mapped\nimage quality index (TMQI). Furthermore, clinical validation conducted with two\nboard-certified radiologists confirmed that the enhanced images enabled more\nsensitive detection of subtle vertebral fractures and degenerative changes. The\nunsupervised nature of XVertNet facilitates immediate clinical deployment\nwithout requiring additional training overhead. This innovation represents a\ntransformative advancement in emergency radiology, providing a scalable and\ntime-efficient solution to enhance diagnostic accuracy in high-pressure\nclinical environments.\n","authors":["Ella Eidlin","Assaf Hoogi","Hila Rozen","Mohammad Badarne","Nathan S. 
Netanyahu"],"pdf_url":"https://arxiv.org/pdf/2306.03983v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2501.07769v1","updated":"2025-01-14T00:59:55Z","published":"2025-01-14T00:59:55Z","title":"BMIP: Bi-directional Modality Interaction Prompt Learning for VLM","summary":" Vision-language models (VLMs) have exhibited remarkable generalization\ncapabilities, and prompt learning for VLMs has attracted great attention for\nthe ability to adapt pre-trained VLMs to specific downstream tasks. However,\nexisting studies mainly focus on single-modal prompts or uni-directional\nmodality interaction, overlooking the powerful alignment effects resulting from\nthe interaction between the vision and language modalities. To this end, we\npropose a novel prompt learning method called\n$\\underline{\\textbf{B}}i-directional \\underline{\\textbf{M}}odality\n\\underline{\\textbf{I}}nteraction \\underline{\\textbf{P}}rompt (BMIP)$, which\ndynamically weights bi-modal information through learning the information of\nthe attention layer, enhancing trainability and inter-modal consistency\ncompared to simple information aggregation methods. To evaluate the\neffectiveness of prompt learning methods, we propose a more realistic\nevaluation paradigm called open-world generalization complementing the widely\nadopted cross-dataset transfer and domain generalization tasks. Comprehensive\nexperiments on various datasets reveal that BMIP not only outperforms current\nstate-of-the-art methods across all three evaluation paradigms but is also\nflexible enough to be combined with other prompt-based methods for consistent\nperformance enhancement.\n","authors":["Song-Lin Lv","Yu-Yang Chen","Zhi Zhou","Ming Yang","Lan-Zhe Guo"],"pdf_url":"https://arxiv.org/pdf/2501.07769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07762v1","updated":"2025-01-14T00:30:22Z","published":"2025-01-14T00:30:22Z","title":"PSReg: Prior-guided Sparse Mixture of Experts for Point Cloud\n Registration","summary":" The discriminative feature is crucial for point cloud registration. Recent\nmethods improve the feature discriminative by distinguishing between\nnon-overlapping and overlapping region points. However, they still face\nchallenges in distinguishing the ambiguous structures in the overlapping\nregions. Therefore, the ambiguous features they extracted resulted in a\nsignificant number of outlier matches from overlapping regions. To solve this\nproblem, we propose a prior-guided SMoE-based registration method to improve\nthe feature distinctiveness by dispatching the potential correspondences to the\nsame experts. Specifically, we propose a prior-guided SMoE module by fusing\nprior overlap and potential correspondence embeddings for routing, assigning\ntokens to the most suitable experts for processing. In addition, we propose a\nregistration framework by a specific combination of Transformer layer and\nprior-guided SMoE module. The proposed method not only pays attention to the\nimportance of locating the overlapping areas of point clouds, but also commits\nto finding more accurate correspondences in overlapping areas. Our extensive\nexperiments demonstrate the effectiveness of our method, achieving\nstate-of-the-art registration recall (95.7\\%/79.3\\%) on the 3DMatch/3DLoMatch\nbenchmark. 
Moreover, we also test the performance on ModelNet40 and demonstrate\nexcellent performance.\n","authors":["Xiaoshui Huang","Zhou Huang","Yifan Zuo","Yongshun Gong","Chengdong Zhang","Deyang Liu","Yuming Fang"],"pdf_url":"https://arxiv.org/pdf/2501.07762v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.08495v1","updated":"2025-01-14T23:57:35Z","published":"2025-01-14T23:57:35Z","title":"Automotive Elevation Mapping with Interferometric Synthetic Aperture\n Radar","summary":" Radar is a low-cost and ubiquitous automotive sensor, but is limited by array\nresolution and sensitivity when performing direction of arrival analysis.\nSynthetic Aperture Radar (SAR) is a class of techniques to improve azimuth\nresolution and sensitivity for radar. Interferometric SAR (InSAR) can be used\nto extract elevation from the variations in phase measurements in SAR images.\nUtilizing InSAR we show that a typical, low-resolution radar array mounted on a\nvehicle can be used to accurately localize detections in 3D space for both\nurban and agricultural environments. We generate point clouds in each\nenvironment by combining InSAR with a signal processing scheme tailored to\nautomotive driving. This low-compute approach allows radar to be used as a\nprimary sensor to map fine details in complex driving environments, and be used\nto make autonomous perception decisions.\n","authors":["Leyla A. Kabuli","Griffin Foster"],"pdf_url":"https://arxiv.org/pdf/2501.08495v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.08490v1","updated":"2025-01-14T23:31:20Z","published":"2025-01-14T23:31:20Z","title":"FLAVARS: A Multimodal Foundational Language and Vision Alignment Model\n for Remote Sensing","summary":" Remote sensing imagery is dense with objects and contextual visual\ninformation. There is a recent trend to combine paired satellite images and\ntext captions for pretraining performant encoders for downstream tasks.\nHowever, while contrastive image-text methods like CLIP enable vision-language\nalignment and zero-shot classification ability, vision-only downstream\nperformance tends to degrade compared to image-only pretraining, such as MAE.\nIn this paper, we propose FLAVARS, a pretraining method that combines the best\nof both contrastive learning and masked modeling, along with geospatial\nalignment via contrastive location encoding. We find that FLAVARS significantly\noutperforms a baseline of SkyCLIP for vision-only tasks such as KNN\nclassification and semantic segmentation, +6\\% mIOU on SpaceNet1, while\nretaining the ability to perform zero-shot classification, unlike MAE\npretrained methods.\n","authors":["Isaac Corley","Simone Fobi Nsutezo","Anthony Ortiz","Caleb Robinson","Rahul Dodhia","Juan M. Lavista Ferres","Peyman Najafirad"],"pdf_url":"https://arxiv.org/pdf/2501.08490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08471v1","updated":"2025-01-14T22:36:11Z","published":"2025-01-14T22:36:11Z","title":"Benchmarking Classical, Deep, and Generative Models for Human Activity\n Recognition","summary":" Human Activity Recognition (HAR) has gained significant importance with the\ngrowing use of sensor-equipped devices and large datasets. This paper evaluates\nthe performance of three categories of models : classical machine learning,\ndeep learning architectures, and Restricted Boltzmann Machines (RBMs) using\nfive key benchmark datasets of HAR (UCI-HAR, OPPORTUNITY, PAMAP2, WISDM, and\nBerkeley MHAD). 
We assess various models, including Decision Trees, Random\nForests, Convolutional Neural Networks (CNN), and Deep Belief Networks (DBNs),\nusing metrics such as accuracy, precision, recall, and F1-score for a\ncomprehensive comparison. The results show that CNN models offer superior\nperformance across all datasets, especially on the Berkeley MHAD. Classical\nmodels like Random Forest do well on smaller datasets but face challenges with\nlarger, more complex data. RBM-based models also show notable potential,\nparticularly for feature learning. This paper offers a detailed comparison to\nhelp researchers choose the most suitable model for HAR tasks.\n","authors":["Md Meem Hossain","The Anh Han","Safina Showkat Ara","Zia Ush Shamszaman"],"pdf_url":"https://arxiv.org/pdf/2501.08471v1.pdf","comment":"48 pages, 21 Figures"},{"id":"http://arxiv.org/abs/2501.08470v1","updated":"2025-01-14T22:33:07Z","published":"2025-01-14T22:33:07Z","title":"Detecting Contextual Anomalies by Discovering Consistent Spatial Regions","summary":" We describe a method for modeling spatial context to enable video anomaly\ndetection. The main idea is to discover regions that share similar object-level\nactivities by clustering joint object attributes using Gaussian mixture models.\nWe demonstrate that this straightforward approach, using orders of magnitude\nfewer parameters than competing models, achieves state-of-the-art performance\nin the challenging spatial-context-dependent Street Scene dataset. As a side\nbenefit, the high-resolution discovered regions learned by the model also\nprovide explainable normalcy maps for human operators without the need for any\npre-trained segmentation model.\n","authors":["Zhengye Yang","Richard J. Radke"],"pdf_url":"https://arxiv.org/pdf/2501.08470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06720v4","updated":"2025-01-14T22:30:10Z","published":"2023-04-13T17:59:55Z","title":"Expressive Text-to-Image Generation with Rich Text","summary":" Plain text has become a prevalent interface for text-to-image synthesis.\nHowever, its limited customization options hinder users from accurately\ndescribing desired outputs. For example, plain text makes it hard to specify\ncontinuous quantities, such as the precise RGB color value or importance of\neach word. Furthermore, creating detailed text prompts for complex scenes is\ntedious for humans to write and challenging for text encoders to interpret. To\naddress these challenges, we propose using a rich-text editor supporting\nformats such as font style, size, color, and footnote. We extract each word's\nattributes from rich text to enable local style control, explicit token\nreweighting, precise color rendering, and detailed region synthesis. We achieve\nthese capabilities through a region-based diffusion process. We first obtain\neach word's region based on attention maps of a diffusion process using plain\ntext. For each region, we enforce its text attributes by creating\nregion-specific detailed prompts and applying region-specific guidance, and\nmaintain its fidelity against plain-text generation through region-based\ninjections. 
We present various examples of image generation from rich text and\ndemonstrate that our method outperforms strong baselines with quantitative\nevaluations.\n","authors":["Songwei Ge","Taesung Park","Jun-Yan Zhu","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2304.06720v4.pdf","comment":"Project webpage: https://rich-text-to-image.github.io/"},{"id":"http://arxiv.org/abs/2501.08465v1","updated":"2025-01-14T22:23:11Z","published":"2025-01-14T22:23:11Z","title":"Predicting Performance of Object Detection Models in Electron Microscopy\n Using Random Forests","summary":" Quantifying prediction uncertainty when applying object detection models to\nnew, unlabeled datasets is critical in applied machine learning. This study\nintroduces an approach to estimate the performance of deep learning-based\nobject detection models for quantifying defects in transmission electron\nmicroscopy (TEM) images, focusing on detecting irradiation-induced cavities in\nTEM images of metal alloys. We developed a random forest regression model that\npredicts the object detection F1 score, a statistical metric used to evaluate\nthe ability to accurately locate and classify objects of interest. The random\nforest model uses features extracted from the predictions of the object\ndetection model whose uncertainty is being quantified, enabling fast prediction\non new, unlabeled images. The mean absolute error (MAE) for predicting F1 of\nthe trained model on test data is 0.09, and the $R^2$ score is 0.77, indicating\nthere is a significant correlation between the random forest regression model\npredicted and true defect detection F1 scores. The approach is shown to be\nrobust across three distinct TEM image datasets with varying imaging and\nmaterial domains. Our approach enables users to estimate the reliability of a\ndefect detection and segmentation model predictions and assess the\napplicability of the model to their specific datasets, providing valuable\ninformation about possible domain shifts and whether the model needs to be\nfine-tuned or trained on additional data to be maximally effective for the\ndesired use case.\n","authors":["Ni Li","Ryan Jacobs","Matthew Lynch","Vidit Agrawal","Kevin Field","Dane Morgan"],"pdf_url":"https://arxiv.org/pdf/2501.08465v1.pdf","comment":"14 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2501.08460v1","updated":"2025-01-14T22:09:06Z","published":"2025-01-14T22:09:06Z","title":"Towards Zero-Shot & Explainable Video Description by Reasoning over\n Graphs of Events in Space and Time","summary":" In the current era of Machine Learning, Transformers have become the de facto\napproach across a variety of domains, such as computer vision and natural\nlanguage processing. Transformer-based solutions are the backbone of current\nstate-of-the-art methods for language generation, image and video\nclassification, segmentation, action and object recognition, among many others.\nInterestingly enough, while these state-of-the-art methods produce impressive\nresults in their respective domains, the problem of understanding the\nrelationship between vision and language is still beyond our reach. 
In this\nwork, we propose a common ground between vision and language based on events in\nspace and time in an explainable and programmatic way, to connect\nlearning-based vision and language state of the art models and provide a\nsolution to the long standing problem of describing videos in natural language.\nWe validate that our algorithmic approach is able to generate coherent, rich\nand relevant textual descriptions on videos collected from a variety of\ndatasets, using both standard metrics (e.g. Bleu, ROUGE) and the modern\nLLM-as-a-Jury approach.\n","authors":["Mihai Masala","Marius Leordeanu"],"pdf_url":"https://arxiv.org/pdf/2501.08460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09349v4","updated":"2025-01-14T22:05:06Z","published":"2023-06-15T17:59:59Z","title":"UrbanIR: Large-Scale Urban Scene Inverse Rendering from a Single Video","summary":" We present UrbanIR (Urban Scene Inverse Rendering), a new inverse graphics\nmodel that enables realistic, free-viewpoint renderings of scenes under various\nlighting conditions with a single video. It accurately infers shape, albedo,\nvisibility, and sun and sky illumination from wide-baseline videos, such as\nthose from car-mounted cameras, differing from NeRF's dense view settings. In\nthis context, standard methods often yield subpar geometry and material\nestimates, such as inaccurate roof representations and numerous 'floaters'.\nUrbanIR addresses these issues with novel losses that reduce errors in inverse\ngraphics inference and rendering artifacts. Its techniques allow for precise\nshadow volume estimation in the original scene. The model's outputs support\ncontrollable editing, enabling photorealistic free-viewpoint renderings of\nnight simulations, relit scenes, and inserted objects, marking a significant\nimprovement over existing state-of-the-art methods.\n","authors":["Chih-Hao Lin","Bohan Liu","Yi-Ting Chen","Kuan-Sheng Chen","David Forsyth","Jia-Bin Huang","Anand Bhattad","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2306.09349v4.pdf","comment":"https://urbaninverserendering.github.io/"},{"id":"http://arxiv.org/abs/2501.08458v1","updated":"2025-01-14T22:03:00Z","published":"2025-01-14T22:03:00Z","title":"RWKV-UNet: Improving UNet with Long-Range Cooperation for Effective\n Medical Image Segmentation","summary":" In recent years, there have been significant advancements in deep learning\nfor medical image analysis, especially with convolutional neural networks\n(CNNs) and transformer models. However, CNNs face limitations in capturing\nlong-range dependencies while transformers suffer high computational\ncomplexities. To address this, we propose RWKV-UNet, a novel model that\nintegrates the RWKV (Receptance Weighted Key Value) structure into the U-Net\narchitecture. This integration enhances the model's ability to capture\nlong-range dependencies and improve contextual understanding, which is crucial\nfor accurate medical image segmentation. We build a strong encoder with\ndeveloped inverted residual RWKV (IR-RWKV) blocks combining CNNs and RWKVs. 
We\nalso propose a Cross-Channel Mix (CCM) module to improve skip connections with\nmulti-scale feature fusion, achieving global channel information integration.\nExperiments on benchmark datasets, including Synapse, ACDC, BUSI, CVC-ClinicDB,\nCVC-ColonDB, Kvasir-SEG, ISIC 2017 and GLAS show that RWKV-UNet achieves\nstate-of-the-art performance on various types of medical image segmentation.\nAdditionally, smaller variants, RWKV-UNet-S and RWKV-UNet-T, balance accuracy\nand computational efficiency, making them suitable for broader clinical\napplications.\n","authors":["Juntao Jiang","Jiangning Zhang","Weixuan Liu","Muxuan Gao","Xiaobin Hu","Xiaoxiao Yan","Feiyue Huang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08453v1","updated":"2025-01-14T21:53:11Z","published":"2025-01-14T21:53:11Z","title":"Vchitect-2.0: Parallel Transformer for Scaling Up Video Diffusion Models","summary":" We present Vchitect-2.0, a parallel transformer architecture designed to\nscale up video diffusion models for large-scale text-to-video generation. The\noverall Vchitect-2.0 system has several key designs. (1) By introducing a novel\nMultimodal Diffusion Block, our approach achieves consistent alignment between\ntext descriptions and generated video frames, while maintaining temporal\ncoherence across sequences. (2) To overcome memory and computational\nbottlenecks, we propose a Memory-efficient Training framework that incorporates\nhybrid parallelism and other memory reduction techniques, enabling efficient\ntraining of long video sequences on distributed systems. (3) Additionally, our\nenhanced data processing pipeline ensures the creation of Vchitect T2V\nDataVerse, a high-quality million-scale training dataset through rigorous\nannotation and aesthetic evaluation. Extensive benchmarking demonstrates that\nVchitect-2.0 outperforms existing methods in video quality, training\nefficiency, and scalability, serving as a suitable base for high-fidelity video\ngeneration.\n","authors":["Weichen Fan","Chenyang Si","Junhao Song","Zhenyu Yang","Yinan He","Long Zhuo","Ziqi Huang","Ziyue Dong","Jingwen He","Dongwei Pan","Yi Wang","Yuming Jiang","Yaohui Wang","Peng Gao","Xinyuan Chen","Hengjie Li","Dahua Lin","Yu Qiao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16689v3","updated":"2025-01-14T21:37:31Z","published":"2024-03-25T12:23:39Z","title":"SYNAPSE: SYmbolic Neural-Aided Preference Synthesis Engine","summary":" This paper addresses the problem of preference learning, which aims to align\nrobot behaviors through learning user specific preferences (e.g. \"good\npull-over location\") from visual demonstrations. Despite its similarity to\nlearning factual concepts (e.g. \"red door\"), preference learning is a\nfundamentally harder problem due to its subjective nature and the paucity of\nperson-specific training data. We address this problem using a novel framework\ncalled SYNAPSE, which is a neuro-symbolic approach designed to efficiently\nlearn preferential concepts from limited data. SYNAPSE represents preferences\nas neuro-symbolic programs, facilitating inspection of individual parts for\nalignment, in a domain-specific language (DSL) that operates over images and\nleverages a novel combination of visual parsing, large language models, and\nprogram synthesis to learn programs representing individual preferences. 
We\nperform extensive evaluations on various preferential concepts as well as user\ncase studies demonstrating its ability to align well with dissimilar user\npreferences. Our method significantly outperforms baselines, especially when it\ncomes to out of distribution generalization. We show the importance of the\ndesign choices in the framework through multiple ablation studies. Code,\nadditional results, and supplementary material can be found on the website:\nhttps://amrl.cs.utexas.edu/synapse\n","authors":["Sadanand Modak","Noah Patton","Isil Dillig","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2403.16689v3.pdf","comment":"Accepted (oral) at AAAI 25"},{"id":"http://arxiv.org/abs/2501.08446v1","updated":"2025-01-14T21:34:34Z","published":"2025-01-14T21:34:34Z","title":"Poseidon: A ViT-based Architecture for Multi-Frame Pose Estimation with\n Adaptive Frame Weighting and Multi-Scale Feature Fusion","summary":" Human pose estimation, a vital task in computer vision, involves detecting\nand localising human joints in images and videos. While single-frame pose\nestimation has seen significant progress, it often fails to capture the\ntemporal dynamics for understanding complex, continuous movements. We propose\nPoseidon, a novel multi-frame pose estimation architecture that extends the\nViTPose model by integrating temporal information for enhanced accuracy and\nrobustness to address these limitations. Poseidon introduces key innovations:\n(1) an Adaptive Frame Weighting (AFW) mechanism that dynamically prioritises\nframes based on their relevance, ensuring that the model focuses on the most\ninformative data; (2) a Multi-Scale Feature Fusion (MSFF) module that\naggregates features from different backbone layers to capture both fine-grained\ndetails and high-level semantics; and (3) a Cross-Attention module for\neffective information exchange between central and contextual frames, enhancing\nthe model's temporal coherence. The proposed architecture improves performance\nin complex video scenarios and offers scalability and computational efficiency\nsuitable for real-world applications. Our approach achieves state-of-the-art\nperformance on the PoseTrack21 and PoseTrack18 datasets, achieving mAP scores\nof 88.3 and 87.8, respectively, outperforming existing methods.\n","authors":["Cesare Davide Pace","Alessandro Marco De Nunzio","Claudio De Stefano","Francesco Fontanella","Mario Molinara"],"pdf_url":"https://arxiv.org/pdf/2501.08446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07298v3","updated":"2025-01-14T21:26:13Z","published":"2024-10-09T17:07:34Z","title":"Enhancing Performance of Point Cloud Completion Networks with\n Consistency Loss","summary":" Point cloud completion networks are conventionally trained to minimize the\ndisparities between the completed point cloud and the ground-truth counterpart.\nHowever, an incomplete object-level point cloud can have multiple valid\ncompletion solutions when it is examined in isolation. This one-to-many mapping\nissue can cause contradictory supervision signals to the network because the\nloss function may produce different values for identical input-output pairs of\nthe network. In many cases, this issue could adversely affect the network\noptimization process. In this work, we propose to enhance the conventional\nlearning objective using a novel completion consistency loss to mitigate the\none-to-many mapping problem. 
Specifically, the proposed consistency loss ensures\nthat a point cloud completion network generates a coherent completion solution\nfor incomplete objects originating from the same source point cloud.\nExperimental results across multiple well-established datasets and benchmarks\ndemonstrated that the proposed completion consistency loss has an excellent\ncapability to enhance the completion performance of various existing networks\nwithout any modification to the design of the networks. The proposed consistency loss\nenhances the performance of the point completion network without affecting the\ninference speed, thereby increasing the accuracy of point cloud completion.\nNotably, a state-of-the-art point completion network trained with the proposed\nconsistency loss can achieve state-of-the-art accuracy on the challenging new\nMVP dataset. The code and results of experiments on various point completion models\nusing the proposed consistency loss will be available at:\nhttps://github.com/kaist-avelab/ConsistencyLoss.\n","authors":["Kevin Tirta Wijaya","Christofel Rio Goenawan","Seung-Hyun Kong"],"pdf_url":"https://arxiv.org/pdf/2410.07298v3.pdf","comment":"First version of Paper \"Enhancing Performance of Point Cloud\n Completion Networks with Consistency Loss\" by Kevin Tirta Wijaya and\n Christofel Rio Goenawan. In process submission to Neurocomputing Journal 2024"},{"id":"http://arxiv.org/abs/2501.08440v1","updated":"2025-01-14T21:08:08Z","published":"2025-01-14T21:08:08Z","title":"FARE: A Deep Learning-Based Framework for Radar-based Face Recognition\n and Out-of-distribution Detection","summary":" In this work, we propose a novel pipeline for face recognition and\nout-of-distribution (OOD) detection using short-range FMCW radar. The proposed\nsystem utilizes Range-Doppler and micro Range-Doppler Images. The architecture\nfeatures a primary path (PP) responsible for the classification of\nin-distribution (ID) faces, complemented by intermediate paths (IPs) dedicated\nto OOD detection. The network is trained in two stages: first, the PP is\ntrained using triplet loss to optimize ID face classification. In the second\nstage, the PP is frozen, and the IPs, comprising simple linear autoencoder\nnetworks, are trained specifically for OOD detection. Using our dataset\ngenerated with a 60 GHz FMCW radar, our method achieves an ID classification\naccuracy of 99.30% and an OOD detection AUROC of 96.91%.\n","authors":["Sabri Mustafa Kahya","Boran Hamdi Sivrikaya","Muhammet Sami Yavuz","Eckehard Steinbach"],"pdf_url":"https://arxiv.org/pdf/2501.08440v1.pdf","comment":"Accepted at ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.07015v2","updated":"2025-01-14T21:02:31Z","published":"2025-01-13T02:28:13Z","title":"SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting","summary":" Achieving high-fidelity 3D reconstruction from monocular video remains\nchallenging due to the inherent limitations of traditional methods like\nStructure-from-Motion (SfM) and monocular SLAM in accurately capturing scene\ndetails. While differentiable rendering techniques such as Neural Radiance\nFields (NeRF) address some of these challenges, their high computational costs\nmake them unsuitable for real-time applications. Additionally, existing 3D\nGaussian Splatting (3DGS) methods often focus on photometric consistency,\nneglecting geometric accuracy and failing to exploit SLAM's dynamic depth and\npose updates for scene refinement. 
We propose a framework integrating dense\nSLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach\nintroduces SLAM-Informed Adaptive Densification, which dynamically updates and\ndensifies the Gaussian model by leveraging dense point clouds from SLAM.\nAdditionally, we incorporate Geometry-Guided Optimization, which combines\nedge-aware geometric constraints and photometric consistency to jointly\noptimize the appearance and geometry of the 3DGS scene representation, enabling\ndetailed and accurate SLAM mapping reconstruction. Experiments on the Replica\nand TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving\nstate-of-the-art results among monocular systems. Specifically, our method\nachieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica,\nrepresenting improvements of 10.7%, 6.4%, and 49.4%, respectively, over the\nprevious SOTA. On TUM-RGBD, our method outperforms the closest baseline by\n10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the\npotential of our framework in bridging the gap between photometric and\ngeometric dense 3D scene representations, paving the way for practical and\nefficient monocular dense reconstruction.\n","authors":["Yue Hu","Rong Liu","Meida Chen","Peter Beerel","Andrew Feng"],"pdf_url":"https://arxiv.org/pdf/2501.07015v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08415v1","updated":"2025-01-14T20:12:09Z","published":"2025-01-14T20:12:09Z","title":"Cross-Modal Transferable Image-to-Video Attack on Video Quality Metrics","summary":" Recent studies have revealed that modern image and video quality assessment\n(IQA/VQA) metrics are vulnerable to adversarial attacks. An attacker can\nmanipulate a video through preprocessing to artificially increase its quality\nscore according to a certain metric, despite no actual improvement in visual\nquality. Most of the attacks studied in the literature are white-box attacks,\nwhile black-box attacks in the context of VQA have received less attention.\nMoreover, some research indicates a lack of transferability of adversarial\nexamples generated for one model to another when applied to VQA. In this paper,\nwe propose a cross-modal attack method, IC2VQA, aimed at exploring the\nvulnerabilities of modern VQA models. This approach is motivated by the\nobservation that the low-level feature spaces of images and videos are similar.\nWe investigate the transferability of adversarial perturbations across\ndifferent modalities; specifically, we analyze how adversarial perturbations\ngenerated on a white-box IQA model with an additional CLIP module can\neffectively target a VQA model. The addition of the CLIP module serves as a\nvaluable aid in increasing transferability, as the CLIP model is known for its\neffective capture of low-level semantics. Extensive experiments demonstrate\nthat IC2VQA achieves a high success rate in attacking three black-box VQA\nmodels. We compare our method with existing black-box attack strategies,\nhighlighting its superiority in terms of attack success within the same number\nof iterations and levels of attack strength. 
We believe that the proposed\nmethod will contribute to the deeper analysis of robust VQA metrics.\n","authors":["Georgii Gotin","Ekaterina Shumitskaya","Anastasia Antsiferova","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2501.08415v1.pdf","comment":"Accepted for VISAPP 2025"},{"id":"http://arxiv.org/abs/2501.08411v1","updated":"2025-01-14T19:59:59Z","published":"2025-01-14T19:59:59Z","title":"BiDepth Multimodal Neural Network: Bidirectional Depth Deep Learning\n Architecture for Spatial-Temporal Prediction","summary":" Accurate prediction of spatial-temporal (ST) information in dynamic systems,\nsuch as urban mobility and weather patterns, is a crucial yet challenging\nproblem. The complexity stems from the intricate interplay between spatial\nproximity and temporal relevance, where both long-term trends and short-term\nfluctuations are present in convoluted patterns. Existing approaches, including\ntraditional statistical methods and conventional neural networks, may provide\ninaccurate results due to the lack of an effective mechanism that\nsimultaneously incorporates information at variable temporal depths while\nmaintaining spatial context, resulting in a trade-off between comprehensive\nlong-term historical analysis and responsiveness to short-term new information.\nTo bridge this gap, this paper proposes the BiDepth Multimodal Neural Network\n(BDMNN) with bidirectional depth modulation that enables a comprehensive\nunderstanding of both long-term seasonality and short-term fluctuations,\nadapting to the complex ST context. Case studies with real-world public data\ndemonstrate significant improvements in prediction accuracy, with a 12%\nreduction in Mean Squared Error for urban traffic prediction and a 15%\nimprovement in rain precipitation forecasting compared to state-of-the-art\nbenchmarks, without demanding extra computational resources.\n","authors":["Sina Ehsani","Fenglian Pan","Qingpei Hu","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08411v1.pdf","comment":"This paper has been submitted to Applied Intelligence for review"},{"id":"http://arxiv.org/abs/2501.08408v1","updated":"2025-01-14T19:56:43Z","published":"2025-01-14T19:56:43Z","title":"Leveraging 2D Masked Reconstruction for Domain Adaptation of 3D Pose\n Estimation","summary":" RGB-based 3D pose estimation methods have been successful with the\ndevelopment of deep learning and the emergence of high-quality 3D pose\ndatasets. However, most existing methods do not operate well for testing images\nwhose distribution is far from that of training data. This problem might be\nalleviated by involving diverse data during training; however, it is non-trivial\nto collect such diverse data with corresponding labels (i.e. 3D pose). In this\npaper, we introduce an unsupervised domain adaptation framework for 3D pose\nestimation that utilizes the unlabeled data in addition to labeled data via a\nmasked image modeling (MIM) framework. Foreground-centric reconstruction and\nattention regularization are further proposed to increase the effectiveness of\nunlabeled data usage. Experiments are conducted on various datasets in human and\nhand pose estimation tasks, especially using the cross-domain scenario. 
We demonstrated\nthe effectiveness of ours by achieving the state-of-the-art accuracy on all\ndatasets.\n","authors":["Hansoo Park","Chanwoo Kim","Jihyeon Kim","Hoseong Cho","Nhat Nguyen Bao Truong","Taehwan Kim","Seungryul Baek"],"pdf_url":"https://arxiv.org/pdf/2501.08408v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.04809v2","updated":"2025-01-14T19:42:28Z","published":"2024-08-09T01:40:12Z","title":"On the Geometry of Deep Learning","summary":" In this paper, we overview one promising avenue of progress at the\nmathematical foundation of deep learning: the connection between deep networks\nand function approximation by affine splines (continuous piecewise linear\nfunctions in multiple dimensions). In particular, we will overview work over\nthe past decade on understanding certain geometrical properties of a deep\nnetwork's affine spline mapping, in particular how it tessellates its input\nspace. As we will see, the affine spline connection and geometrical viewpoint\nprovide a powerful portal through which to view, analyze, and improve the inner\nworkings of a deep network.\n","authors":["Randall Balestriero","Ahmed Imtiaz Humayun","Richard Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2408.04809v2.pdf","comment":"Accepted for publication at 'Notices of the American Mathematical\n Society'"},{"id":"http://arxiv.org/abs/2501.08370v1","updated":"2025-01-14T18:40:33Z","published":"2025-01-14T18:40:33Z","title":"3D Gaussian Splatting with Normal Information for Mesh Extraction and\n Improved Rendering","summary":" Differentiable 3D Gaussian splatting has emerged as an efficient and flexible\nrendering technique for representing complex scenes from a collection of 2D\nviews and enabling high-quality real-time novel-view synthesis. However, its\nreliance on photometric losses can lead to imprecisely reconstructed geometry\nand extracted meshes, especially in regions with high curvature or fine detail.\nWe propose a novel regularization method using the gradients of a signed\ndistance function estimated from the Gaussians, to improve the quality of\nrendering while also extracting a surface mesh. The regularizing normal\nsupervision facilitates better rendering and mesh reconstruction, which is\ncrucial for downstream applications in video generation, animation, AR-VR and\ngaming. We demonstrate the effectiveness of our approach on datasets such as\nMip-NeRF360, Tanks and Temples, and Deep-Blending. Our method scores higher on\nphotorealism metrics compared to other mesh extracting rendering methods\nwithout compromising mesh quality.\n","authors":["Meenakshi Krishnan","Liam Fowl","Ramani Duraiswami"],"pdf_url":"https://arxiv.org/pdf/2501.08370v1.pdf","comment":"ICASSP 2025: Workshop on Generative Data Augmentation for Real-World\n Signal Processing Applications"}]},"2025-01-15T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2403.16478v4","updated":"2025-01-15T17:58:48Z","published":"2024-03-25T07:04:24Z","title":"Real-World Evaluation of two Cooperative Intersection Management\n Approaches","summary":" Cooperative maneuver planning promises to significantly improve traffic\nefficiency at unsignalized intersections by leveraging connected automated\nvehicles. Previous works on this topic have been mostly developed for\ncompletely automated traffic in a simple simulated environment. In contrast,\nour previously introduced planning approaches are specifically designed to\nhandle real-world mixed traffic. 
The two methods are based on multi-scenario\nprediction and graph-based reinforcement learning, respectively. This is the\nfirst study to perform evaluations in a novel mixed traffic simulation\nframework as well as real-world drives with prototype connected automated\nvehicles in public traffic. The simulation features the same connected\nautomated driving software stack as deployed on one of the automated vehicles.\nOur quantitative evaluations show that cooperative maneuver planning achieves a\nsubstantial reduction in crossing times and the number of stops. In a realistic\nenvironment with few automated vehicles, there are noticeable efficiency gains\nwith only slightly increasing criticality metrics.\n","authors":["Marvin Klimke","Max Bastian Mertens","Benjamin Völz","Michael Buchholz"],"pdf_url":"https://arxiv.org/pdf/2403.16478v4.pdf","comment":"M. Klimke and M. B. Mertens are both first authors with equal\n contribution. 10 pages, 9 figures, 3 tables, submitted to IEEE Intelligent\n Transportation Systems Magazine"},{"id":"http://arxiv.org/abs/2411.00138v3","updated":"2025-01-15T17:44:41Z","published":"2024-10-31T18:37:22Z","title":"Learning Low-Dimensional Strain Models of Soft Robots by Looking at the\n Evolution of Their Shape with Application to Model-Based Control","summary":" Obtaining dynamic models of continuum soft robots is central to the analysis\nand control of soft robots, and researchers have devoted much attention to the\nchallenge of proposing both data-driven and first-principle solutions. Both\navenues have, however, shown their limitations; the former lacks structure and\nperforms poorly outside training data, while the latter requires significant\nsimplifications and extensive expert knowledge to be used in practice. This\npaper introduces a streamlined method for learning low-dimensional,\nphysics-based models that are both accurate and easy to interpret. We start\nwith an algorithm that uses image data (i.e., shape evolutions) to determine\nthe minimal necessary segments for describing a soft robot's movement.\nFollowing this, we apply a dynamic regression and strain sparsification\nalgorithm to identify relevant strains and define the model's dynamics. We\nvalidate our approach through simulations with various planar soft\nmanipulators, comparing its performance against other learning strategies,\nshowing that our models are both computationally efficient and 25x more\naccurate on out-of-training distribution inputs. Finally, we demonstrate that\nthanks to the capability of the method of generating physically compatible\nmodels, the learned models can be straightforwardly combined with model-based\ncontrol policies.\n","authors":["Ricardo Valadas","Maximilian Stölzle","Jingyue Liu","Cosimo Della Santina"],"pdf_url":"https://arxiv.org/pdf/2411.00138v3.pdf","comment":"8 pages, appearing in Proceedings of the 2025 IEEE 8th International\n Conference on Soft Robotics (RoboSoft)"},{"id":"http://arxiv.org/abs/2501.08946v1","updated":"2025-01-15T16:49:22Z","published":"2025-01-15T16:49:22Z","title":"Applying General Turn-taking Models to Conversational Human-Robot\n Interaction","summary":" Turn-taking is a fundamental aspect of conversation, but current Human-Robot\nInteraction (HRI) systems often rely on simplistic, silence-based models,\nleading to unnatural pauses and interruptions. 
This paper investigates, for the\nfirst time, the application of general turn-taking models, specifically TurnGPT\nand Voice Activity Projection (VAP), to improve conversational dynamics in HRI.\nThese models are trained on human-human dialogue data using self-supervised\nlearning objectives, without requiring domain-specific fine-tuning. We propose\nmethods for using these models in tandem to predict when a robot should begin\npreparing responses, take turns, and handle potential interruptions. We\nevaluated the proposed system in a within-subject study against a traditional\nbaseline system, using the Furhat robot with 39 adults in a conversational\nsetting, in combination with a large language model for autonomous response\ngeneration. The results show that participants significantly prefer the\nproposed system, and it significantly reduces response delays and\ninterruptions.\n","authors":["Gabriel Skantze","Bahar Irfan"],"pdf_url":"https://arxiv.org/pdf/2501.08946v1.pdf","comment":"Accepted at HRI 2025 (the IEEE/ACM International Conference on\n Human-Robot Interaction)"},{"id":"http://arxiv.org/abs/2501.08941v1","updated":"2025-01-15T16:44:35Z","published":"2025-01-15T16:44:35Z","title":"A Reinforcement Learning Approach to Quiet and Safe UAM Traffic\n Management","summary":" Urban air mobility (UAM) is a transformative system that operates various\nsmall aerial vehicles in urban environments to reshape urban transportation.\nHowever, integrating UAM into existing urban environments presents a variety of\ncomplex challenges. Recent analyses of UAM's operational constraints highlight\naircraft noise and system safety as key hurdles to UAM system implementation.\nFuture UAM air traffic management schemes must ensure that the system is both\nquiet and safe. We propose a multi-agent reinforcement learning approach to\nmanage UAM traffic, aiming at both vertical separation assurance and noise\nmitigation. Through extensive training, the reinforcement learning agent learns\nto balance the two primary objectives by employing altitude adjustments in a\nmulti-layer UAM network. The results reveal the tradeoffs among noise impact,\ntraffic congestion, and separation. Overall, our findings demonstrate the\npotential of reinforcement learning in mitigating UAM's noise impact while\nmaintaining safe separation using altitude adjustments\n","authors":["Surya Murthy","John-Paul Clarke","Ufuk Topcu","Zhenyu Gao"],"pdf_url":"https://arxiv.org/pdf/2501.08941v1.pdf","comment":"Paper presented at SciTech 2025"},{"id":"http://arxiv.org/abs/2501.08908v1","updated":"2025-01-15T16:18:13Z","published":"2025-01-15T16:18:13Z","title":"When Uncertainty Leads to Unsafety: Empirical Insights into the Role of\n Uncertainty in Unmanned Aerial Vehicle Safety","summary":" Despite the recent developments in obstacle avoidance and other safety\nfeatures, autonomous Unmanned Aerial Vehicles (UAVs) continue to face safety\nchallenges. No previous work investigated the relationship between the\nbehavioral uncertainty of a UAV and the unsafety of its flight. By quantifying\nuncertainty, it is possible to develop a predictor for unsafety, which acts as\na flight supervisor. 
We conducted a large-scale empirical investigation of\nsafety violations using PX4-Autopilot, an open-source UAV software platform.\nOur dataset of over 5,000 simulated flights, created to challenge obstacle\navoidance, allowed us to explore the relation between uncertain UAV decisions\nand safety violations: up to 89% of unsafe UAV states exhibit significant\ndecision uncertainty, and up to 74% of uncertain decisions lead to unsafe\nstates. Based on these findings, we implemented Superialist (Supervising\nAutonomous Aerial Vehicles), a runtime uncertainty detector based on\nautoencoders, the state-of-the-art technology for anomaly detection.\nSuperialist achieved high performance in detecting uncertain behaviors with up\nto 96% precision and 93% recall. Despite the observed performance degradation\nwhen using the same approach for predicting unsafety (up to 74% precision and\n87% recall), Superialist enabled early prediction of unsafe states up to 50\nseconds in advance.\n","authors":["Sajad Khatiri","Fatemeh Mohammadi Amin","Sebastiano Panichella","Paolo Tonella"],"pdf_url":"https://arxiv.org/pdf/2501.08908v1.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2501.08880v1","updated":"2025-01-15T15:51:06Z","published":"2025-01-15T15:51:06Z","title":"SLC$^2$-SLAM: Semantic-guided Loop Closure with Shared Latent Code for\n NeRF SLAM","summary":" Targeting the notorious cumulative drift errors in NeRF SLAM, we propose a\nSemantic-guided Loop Closure with Shared Latent Code, dubbed SLC$^2$-SLAM.\nEspecially, we argue that latent codes stored in many NeRF SLAM systems are not\nfully exploited, as they are only used for better reconstruction. In this\npaper, we propose a simple yet effective way to detect potential loops using\nthe same latent codes as local features. To further improve the loop detection\nperformance, we use the semantic information, which are also decoded from the\nsame latent codes to guide the aggregation of local features. Finally, with the\npotential loops detected, we close them with a graph optimization followed by\nbundle adjustment to refine both the estimated poses and the reconstructed\nscene. To evaluate the performance of our SLC$^2$-SLAM, we conduct extensive\nexperiments on Replica and ScanNet datasets. Our proposed semantic-guided loop\nclosure significantly outperforms the pre-trained NetVLAD and ORB combined with\nBag-of-Words, which are used in all the other NeRF SLAM with loop closure. As a\nresult, our SLC$^2$-SLAM also demonstrated better tracking and reconstruction\nperformance, especially in larger scenes with more loops, like ScanNet.\n","authors":["Yuhang Ming","Di Ma","Weichen Dai","Han Yang","Rui Fan","Guofeng Zhang","Wanzeng Kong"],"pdf_url":"https://arxiv.org/pdf/2501.08880v1.pdf","comment":"8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2501.07317v3","updated":"2025-01-15T14:01:15Z","published":"2025-01-13T13:28:03Z","title":"Evaluation of Artificial Intelligence Methods for Lead Time Prediction\n in Non-Cycled Areas of Automotive Production","summary":" The present study examines the effectiveness of applying Artificial\nIntelligence methods in an automotive production environment to predict unknown\nlead times in a non-cycle-controlled production area. Data structures are\nanalyzed to identify contextual features and then preprocessed using one-hot\nencoding. Methods selection focuses on supervised machine learning techniques.\nIn supervised learning methods, regression and classification methods are\nevaluated. 
Continuous regression based on target size distribution is not\nfeasible. Classification methods analysis shows that Ensemble Learning and\nSupport Vector Machines are the most suitable. Preliminary study results\nindicate that gradient boosting algorithms LightGBM, XGBoost, and CatBoost\nyield the best results. After further testing and extensive hyperparameter\noptimization, the final method choice is the LightGBM algorithm. Depending on\nfeature availability and prediction interval granularity, relative prediction\naccuracies of up to 90% can be achieved. Further tests highlight the importance\nof periodic retraining of AI models to accurately represent complex production\nprocesses using the database. The research demonstrates that AI methods can be\neffectively applied to highly variable production data, adding business value\nby providing an additional metric for various control tasks while outperforming\ncurrent non AI-based systems.\n","authors":["Cornelius Hake","Jonas Weigele","Frederik Reichert","Christian Friedrich"],"pdf_url":"https://arxiv.org/pdf/2501.07317v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10700v2","updated":"2025-01-15T12:45:24Z","published":"2024-03-15T21:36:15Z","title":"Mind the Error! Detection and Localization of Instruction Errors in\n Vision-and-Language Navigation","summary":" Vision-and-Language Navigation in Continuous Environments (VLN-CE) is one of\nthe most intuitive yet challenging embodied AI tasks. Agents are tasked to\nnavigate towards a target goal by executing a set of low-level actions,\nfollowing a series of natural language instructions. All VLN-CE methods in the\nliterature assume that language instructions are exact. However, in practice,\ninstructions given by humans can contain errors when describing a spatial\nenvironment due to inaccurate memory or confusion. Current VLN-CE benchmarks do\nnot address this scenario, making the state-of-the-art methods in VLN-CE\nfragile in the presence of erroneous instructions from human users. For the\nfirst time, we propose a novel benchmark dataset that introduces various types\nof instruction errors considering potential human causes. This benchmark\nprovides valuable insight into the robustness of VLN systems in continuous\nenvironments. We observe a noticeable performance drop (up to -25%) in Success\nRate when evaluating the state-of-the-art VLN-CE methods on our benchmark.\nMoreover, we formally define the task of Instruction Error Detection and\nLocalization, and establish an evaluation protocol on top of our benchmark\ndataset. We also propose an effective method, based on a cross-modal\ntransformer architecture, that achieves the best performance in error detection\nand localization, compared to baselines. Surprisingly, our proposed method has\nrevealed errors in the validation set of the two commonly used datasets for\nVLN-CE, i.e., R2R-CE and RxR-CE, demonstrating the utility of our technique in\nother tasks. Code and dataset available at\nhttps://intelligolabs.github.io/R2RIE-CE\n","authors":["Francesco Taioli","Stefano Rosa","Alberto Castellini","Lorenzo Natale","Alessio Del Bue","Alessandro Farinelli","Marco Cristani","Yiming Wang"],"pdf_url":"https://arxiv.org/pdf/2403.10700v2.pdf","comment":"3 figures, 8 pages. 
Accepted at IROS'24"},{"id":"http://arxiv.org/abs/2403.13674v2","updated":"2025-01-15T11:27:53Z","published":"2024-03-20T15:32:56Z","title":"Reward-Driven Automated Curriculum Learning for Interaction-Aware\n Self-Driving at Unsignalized Intersections","summary":" In this work, we present a reward-driven automated curriculum reinforcement\nlearning approach for interaction-aware self-driving at unsignalized\nintersections, taking into account the uncertainties associated with\nsurrounding vehicles (SVs). These uncertainties encompass the uncertainty of\nSVs' driving intention and also the quantity of SVs. To deal with this problem,\nthe curriculum set is specifically designed to accommodate a progressively\nincreasing number of SVs. By implementing an automated curriculum selection\nmechanism, the importance weights are rationally allocated across various\ncurricula, thereby facilitating improved sample efficiency and training\noutcomes. Furthermore, the reward function is meticulously designed to guide\nthe agent towards effective policy exploration. Thus the proposed framework\ncould proactively address the above uncertainties at unsignalized intersections\nby employing the automated curriculum learning technique that progressively\nincreases task difficulty, and this ensures safe self-driving through effective\ninteraction with SVs. Comparative experiments are conducted in $Highway\\_Env$,\nand the results indicate that our approach achieves the highest task success\nrate, attains strong robustness to initialization parameters of the curriculum\nselection module, and exhibits superior adaptability to diverse situational\nconfigurations at unsignalized intersections. Furthermore, the effectiveness of\nthe proposed method is validated using the high-fidelity CARLA simulator.\n","authors":["Zengqi Peng","Xiao Zhou","Lei Zheng","Yubin Wang","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2403.13674v2.pdf","comment":"8 pages, 6 figures, add grant information, minor textual polishing"},{"id":"http://arxiv.org/abs/2501.08726v1","updated":"2025-01-15T11:10:34Z","published":"2025-01-15T11:10:34Z","title":"Task Allocation in Mobile Robot Fleets: A review","summary":" Mobile robot fleets are currently used in different scenarios such as medical\nenvironments or logistics. The management of these systems provides different\nchallenges that vary from the control of the movement of each robot to the\nallocation of tasks to be performed. Task Allocation (TA) problem is a key\ntopic for the proper management of mobile robot fleets to ensure the\nminimization of energy consumption and quantity of necessary robots. Solutions\non this aspect are essential to reach economic and environmental sustainability\nof robot fleets, mainly in industry applications such as warehouse logistics.\nThe minimization of energy consumption introduces TA problem as an optimization\nissue which has been treated in recent studies. This work focuses on the\nanalysis of current trends in solving TA of mobile robot fleets. Main TA\noptimization algorithms are presented, including novel methods based on\nArtificial Intelligence (AI). Additionally, this work showcases most important\nresults extracted from simulations, including frameworks utilized for the\ndevelopment of the simulations. 
Finally, some conclusions are obtained from the\nanalysis to target on gaps that must be treated in the future.\n","authors":["Andrés Meseguer Valenzuela","Francisco Blanes Noguera"],"pdf_url":"https://arxiv.org/pdf/2501.08726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08672v1","updated":"2025-01-15T09:04:56Z","published":"2025-01-15T09:04:56Z","title":"GS-LIVO: Real-Time LiDAR, Inertial, and Visual Multi-sensor Fused\n Odometry with Gaussian Mapping","summary":" In recent years, 3D Gaussian splatting (3D-GS) has emerged as a novel scene\nrepresentation approach. However, existing vision-only 3D-GS methods often rely\non hand-crafted heuristics for point-cloud densification and face challenges in\nhandling occlusions and high GPU memory and computation consumption.\nLiDAR-Inertial-Visual (LIV) sensor configuration has demonstrated superior\nperformance in localization and dense mapping by leveraging complementary\nsensing characteristics: rich texture information from cameras, precise\ngeometric measurements from LiDAR, and high-frequency motion data from IMU.\nInspired by this, we propose a novel real-time Gaussian-based simultaneous\nlocalization and mapping (SLAM) system. Our map system comprises a global\nGaussian map and a sliding window of Gaussians, along with an IESKF-based\nodometry. The global Gaussian map consists of hash-indexed voxels organized in\na recursive octree, effectively covering sparse spatial volumes while adapting\nto different levels of detail and scales. The Gaussian map is initialized\nthrough multi-sensor fusion and optimized with photometric gradients. Our\nsystem incrementally maintains a sliding window of Gaussians, significantly\nreducing GPU computation and memory consumption by only optimizing the map\nwithin the sliding window. Moreover, we implement a tightly coupled\nmulti-sensor fusion odometry with an iterative error state Kalman filter\n(IESKF), leveraging real-time updating and rendering of the Gaussian map. Our\nsystem represents the first real-time Gaussian-based SLAM framework deployable\non resource-constrained embedded systems, demonstrated on the NVIDIA Jetson\nOrin NX platform. The framework achieves real-time performance while\nmaintaining robust multi-sensor fusion capabilities. All implementation\nalgorithms, hardware designs, and CAD models will be publicly available.\n","authors":["Sheng Hong","Chunran Zheng","Yishu Shen","Changze Li","Fu Zhang","Tong Qin","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2501.08672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08655v1","updated":"2025-01-15T08:46:20Z","published":"2025-01-15T08:46:20Z","title":"Application of Deep Reinforcement Learning to UAV Swarming for Ground\n Surveillance","summary":" This paper summarizes in depth the state of the art of aerial swarms,\ncovering both classical and new reinforcement-learning-based approaches for\ntheir management. Then, it proposes a hybrid AI system, integrating deep\nreinforcement learning in a multi-agent centralized swarm architecture. The\nproposed system is tailored to perform surveillance of a specific area,\nsearching and tracking ground targets, for security and law enforcement\napplications. The swarm is governed by a central swarm controller responsible\nfor distributing different search and tracking tasks among the cooperating\nUAVs. 
Each UAV agent is then controlled by a collection of cooperative\nsub-agents, whose behaviors have been trained using different deep\nreinforcement learning models, tailored for the different task types proposed\nby the swarm controller. More specifically, proximal policy optimization (PPO)\nalgorithms were used to train the agents' behavior. In addition, several\nmetrics to assess the performance of the swarm in this application were\ndefined. The results obtained through simulation show that our system searches\nthe operation area effectively, acquires the targets in a reasonable time, and\nis capable of tracking them continuously and consistently.\n","authors":["Raúl Arranz","David Carramiñana","Gonzalo de Miguel","Juan A. Besada","Ana M. Bernardos"],"pdf_url":"https://arxiv.org/pdf/2501.08655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15627v2","updated":"2025-01-15T08:18:38Z","published":"2024-09-24T00:10:29Z","title":"ModCube: Modular, Self-Assembling Cubic Underwater Robot","summary":" This paper presents a low-cost, centralized modular underwater robot\nplatform, ModCube, which can be used to study swarm coordination for a wide\nrange of tasks in underwater environments. A ModCube structure consists of\nmultiple ModCube robots. Each robot can move in six DoF with eight thrusters\nand can be rigidly connected to other ModCube robots with an electromagnet\ncontrolled by onboard computer. In this paper, we present a novel method for\ncharacterizing and visualizing dynamic behavior, along with four benchmarks to\nevaluate the morphological performance of the robot. Analysis shows that our\nModCube design is desirable for omnidirectional tasks, compared with the\nconfigurations widely used by commercial underwater robots. We run real robot\nexperiments in two water tanks to demonstrate the robust control and\nself-assemble of the proposed system, We also open-source the design and code\nto facilitate future research.\n","authors":["Jiaxi Zheng","Guangmin Dai","Botao He","Zhaoyang Mu","Zhaochen Meng","Tianyi Zhang","Weiming Zhi","Dixia Fan"],"pdf_url":"https://arxiv.org/pdf/2409.15627v2.pdf","comment":"8 pages, 8 figures, letter"},{"id":"http://arxiv.org/abs/2501.06605v2","updated":"2025-01-15T08:01:51Z","published":"2025-01-11T18:11:07Z","title":"RoboHorizon: An LLM-Assisted Multi-View World Model for Long-Horizon\n Robotic Manipulation","summary":" Efficient control in long-horizon robotic manipulation is challenging due to\ncomplex representation and policy learning requirements. Model-based visual\nreinforcement learning (RL) has shown great potential in addressing these\nchallenges but still faces notable limitations, particularly in handling sparse\nrewards and complex visual features in long-horizon environments. To address\nthese limitations, we propose the Recognize-Sense-Plan-Act (RSPA) pipeline for\nlong-horizon tasks and further introduce RoboHorizon, an LLM-assisted\nmulti-view world model tailored for long-horizon robotic manipulation. In\nRoboHorizon, pre-trained LLMs generate dense reward structures for multi-stage\nsub-tasks based on task language instructions, enabling robots to better\nrecognize long-horizon tasks. Keyframe discovery is then integrated into the\nmulti-view masked autoencoder (MAE) architecture to enhance the robot's ability\nto sense critical task sequences, strengthening its multi-stage perception of\nlong-horizon processes. 
Leveraging these dense rewards and multi-view\nrepresentations, a robotic world model is constructed to efficiently plan\nlong-horizon tasks, enabling the robot to reliably act through RL algorithms.\nExperiments on two representative benchmarks, RLBench and FurnitureBench, show\nthat RoboHorizon outperforms state-of-the-art visual model-based RL methods,\nachieving a 23.35% improvement in task success rates on RLBench's 4\nshort-horizon tasks and a 29.23% improvement on 6 long-horizon tasks from\nRLBench and 3 furniture assembly tasks from FurnitureBench.\n","authors":["Zixuan Chen","Jing Huo","Yangtao Chen","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2501.06605v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2501.08629v1","updated":"2025-01-15T07:24:15Z","published":"2025-01-15T07:24:15Z","title":"Self-Organizing Edge Computing Distribution Framework for Visual SLAM","summary":" Localization within a known environment is a crucial capability for mobile\nrobots. Simultaneous Localization and Mapping (SLAM) is a prominent solution to\nthis problem. SLAM is a framework that consists of a diverse set of\ncomputational tasks ranging from real-time tracking to computation-intensive\nmap optimization. This combination can present a challenge for resource-limited\nmobile robots. Previously, edge-assisted SLAM methods have demonstrated\npromising real-time execution capabilities by offloading heavy computations\nwhile performing real-time tracking onboard. However, the common approach of\nutilizing a client-server architecture for offloading is sensitive to server\nand network failures. In this article, we propose a novel edge-assisted SLAM\nframework capable of self-organizing fully distributed SLAM execution across a\nnetwork of devices or functioning on a single device without connectivity. The\narchitecture consists of three layers and is designed to be device-agnostic,\nresilient to network failures, and minimally invasive to the core SLAM system.\nWe have implemented and demonstrated the framework for monocular ORB SLAM3 and\nevaluated it in both fully distributed and standalone SLAM configurations\nagainst the ORB SLAM3. The experiment results demonstrate that the proposed\ndesign matches the accuracy and resource utilization of the monolithic approach\nwhile enabling collaborative execution.\n","authors":["Jussi Kalliola","Lauri Suomela","Sergio Moreschini","David Hästbacka"],"pdf_url":"https://arxiv.org/pdf/2501.08629v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.08593v1","updated":"2025-01-15T05:36:41Z","published":"2025-01-15T05:36:41Z","title":"Image-to-Force Estimation for Soft Tissue Interaction in\n Robotic-Assisted Surgery Using Structured Light","summary":" For Minimally Invasive Surgical (MIS) robots, accurate haptic interaction\nforce feedback is essential for ensuring the safety of interacting with soft\ntissue. However, most existing MIS robotic systems cannot facilitate direct\nmeasurement of the interaction force with hardware sensors due to space\nlimitations. This letter introduces an effective vision-based scheme that\nutilizes a One-Shot structured light projection with a designed pattern on soft\ntissue coupled with haptic information processing through a trained\nimage-to-force neural network. The images captured from the endoscopic stereo\ncamera are analyzed to reconstruct high-resolution 3D point clouds for soft\ntissue deformation. 
Based on this, a modified PointNet-based force estimation\nmethod is proposed, which excels in representing the complex mechanical\nproperties of soft tissue. Numerical force interaction experiments are\nconducted on three silicon materials with different stiffness. The results\nvalidate the effectiveness of the proposed scheme.\n","authors":["Jiayin Wang","Mingfeng Yao","Yanran Wei","Xiaoyu Guo","Ayong Zheng","Weidong Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.08593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08575v1","updated":"2025-01-15T04:51:10Z","published":"2025-01-15T04:51:10Z","title":"GOTLoc: General Outdoor Text-based Localization Using Scene Graph\n Retrieval with OpenStreetMap","summary":" We propose GOTLoc, a robust localization method capable of operating even in\noutdoor environments where GPS signals are unavailable. The method achieves\nthis robust localization by leveraging comparisons between scene graphs\ngenerated from text descriptions and maps. Existing text-based localization\nstudies typically represent maps as point clouds and identify the most similar\nscenes by comparing embeddings of text and point cloud data. However, point\ncloud maps have limited scalability as it is impractical to pre-generate maps\nfor all outdoor spaces. Furthermore, their large data size makes it challenging\nto store and utilize them directly on actual robots. To address these issues,\nGOTLoc leverages compact data structures, such as scene graphs, to store\nspatial information, enabling individual robots to carry and utilize large\namounts of map data. Additionally, by utilizing publicly available map data,\nsuch as OpenStreetMap, which provides global information on outdoor spaces, we\neliminate the need for additional effort to create custom map data. For\nperformance evaluation, we utilized the KITTI360Pose dataset in conjunction\nwith corresponding OpenStreetMap data to compare the proposed method with\nexisting approaches. Our results demonstrate that the proposed method achieves\naccuracy comparable to algorithms relying on point cloud maps. Moreover, in\ncity-scale tests, GOTLoc required significantly less storage compared to point\ncloud-based methods and completed overall processing within a few seconds,\nvalidating its applicability to real-world robotics. Our code is available at\nhttps://github.com/donghwijung/GOTLoc.\n","authors":["Donghwi Jung","Keonwoo Kim","Seong-Woo Kim"],"pdf_url":"https://arxiv.org/pdf/2501.08575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08558v1","updated":"2025-01-15T03:49:08Z","published":"2025-01-15T03:49:08Z","title":"LAMS: LLM-Driven Automatic Mode Switching for Assistive Teleoperation","summary":" Teleoperating high degrees-of-freedom (DoF) robotic manipulators via low-DoF\ncontrollers like joysticks often requires frequent switching between control\nmodes, where each mode maps controller movements to specific robot actions.\nManually performing this frequent switching can make teleoperation cumbersome\nand inefficient. On the other hand, existing automatic mode-switching\nsolutions, such as heuristic-based or learning-based methods, are often\ntask-specific and lack generalizability. 
In this paper, we introduce LLM-Driven\nAutomatic Mode Switching (LAMS), a novel approach that leverages Large Language\nModels (LLMs) to automatically switch control modes based on task context.\nUnlike existing methods, LAMS requires no prior task demonstrations and\nincrementally improves by integrating user-generated mode-switching examples.\nWe validate LAMS through an ablation study and a user study with 10\nparticipants on complex, long-horizon tasks, demonstrating that LAMS\neffectively reduces manual mode switches, is preferred over alternative\nmethods, and improves performance over time. The project website with\nsupplementary materials is at https://lams-assistance.github.io/.\n","authors":["Yiran Tao","Jehan Yang","Dan Ding","Zackory Erickson"],"pdf_url":"https://arxiv.org/pdf/2501.08558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08520v1","updated":"2025-01-15T02:14:04Z","published":"2025-01-15T02:14:04Z","title":"Chance-Constrained Sampling-Based MPC for Collision Avoidance in\n Uncertain Dynamic Environments","summary":" Navigating safely in dynamic and uncertain environments is challenging due to\nuncertainties in perception and motion. This letter presents C2U-MPPI, a robust\nsampling-based Model Predictive Control (MPC) framework that addresses these\nchallenges by leveraging the Unscented Model Predictive Path Integral (U-MPPI)\ncontrol strategy with integrated probabilistic chance constraints, ensuring\nmore reliable and efficient navigation under uncertainty. Unlike gradient-based\nMPC methods, our approach (i) avoids linearization of system dynamics and\ndirectly applies non-convex and nonlinear chance constraints, enabling more\naccurate and flexible optimization, and (ii) enhances computational efficiency\nby reformulating probabilistic constraints into a deterministic form and\nemploying a layered dynamic obstacle representation, enabling real-time\nhandling of multiple obstacles. Extensive experiments in simulated and\nreal-world human-shared environments validate the effectiveness of our\nalgorithm against baseline methods, showcasing its capability to generate\nfeasible trajectories and control inputs that adhere to system dynamics and\nconstraints in dynamic settings, enabled by unscented-based sampling strategy\nand risk-sensitive trajectory evaluation. A supplementary video is available\nat: https://youtu.be/FptAhvJlQm8\n","authors":["Ihab S. Mohamed","Mahmoud Ali","Lantao Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08520v1.pdf","comment":"This paper has 8 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2412.01168v3","updated":"2025-01-15T01:31:09Z","published":"2024-12-02T06:10:23Z","title":"On the Surprising Effectiveness of Spectrum Clipping in Learning Stable\n Linear Dynamics","summary":" When learning stable linear dynamical systems from data, three important\nproperties are desirable: i) predictive accuracy, ii) provable stability, and\niii) computational efficiency. Unconstrained minimization of reconstruction\nerrors leads to high accuracy and efficiency but cannot guarantee stability.\nExisting methods to remedy this focus on enforcing stability while also\nensuring accuracy, but do so only at the cost of increased computation. In this\nwork, we investigate if a straightforward approach can simultaneously offer all\nthree desiderata of learning stable linear systems. Specifically, we consider a\npost-hoc approach that manipulates the spectrum of the learned system matrix\nafter it is learned in an unconstrained fashion. 
We call this approach spectrum\nclipping (SC) as it involves eigen decomposition and subsequent reconstruction\nof the system matrix after clipping all of its eigenvalues that are larger than\none to one (without altering the eigenvectors). Through detailed experiments\ninvolving two different applications and publicly available benchmark datasets,\nwe demonstrate that this simple technique can simultaneously learn highly\naccurate linear systems that are provably stable. Notably, we demonstrate that\nSC can achieve similar or better performance than strong baselines while being\norders-of-magnitude faster. We also show that SC can be readily combined with\nKoopman operators to learn stable nonlinear dynamics, such as those underlying\ncomplex dexterous manipulation skills involving multi-fingered robotic hands.\nFurther, we find that SC can learn stable robot policies even when the training\ndata includes unsuccessful or truncated demonstrations. Our codes and dataset\ncan be found at https://github.com/GT-STAR-Lab/spec_clip.\n","authors":["Hanyao Guo","Yunhai Han","Harish Ravichandar"],"pdf_url":"https://arxiv.org/pdf/2412.01168v3.pdf","comment":"Under review by L4DC 2025"},{"id":"http://arxiv.org/abs/2501.08507v1","updated":"2025-01-15T01:09:11Z","published":"2025-01-15T01:09:11Z","title":"A Framework for Dynamic Situational Awareness in Human Robot Teams: An\n Interview Study","summary":" In human-robot teams, human situational awareness is the operator's conscious\nknowledge of the team's states, actions, plans and their environment.\nAppropriate human situational awareness is critical to successful human-robot\ncollaboration. In human-robot teaming, it is often assumed that the best and\nrequired level of situational awareness is knowing everything at all times.\nThis view is problematic, because what a human needs to know for optimal team\nperformance varies given the dynamic environmental conditions, task context and\nroles and capabilities of team members. We explore this topic by interviewing\n16 participants with active and repeated experience in diverse human-robot\nteaming applications. Based on analysis of these interviews, we derive a\nframework explaining the dynamic nature of required situational awareness in\nhuman-robot teaming. In addition, we identify a range of factors affecting the\ndynamic nature of required and actual levels of situational awareness (i.e.,\ndynamic situational awareness), types of situational awareness inefficiencies\nresulting from gaps between actual and required situational awareness, and\ntheir main consequences. We also reveal various strategies, initiated by humans\nand robots, that assist in maintaining the required situational awareness. Our\nfindings inform the implementation of accurate estimates of dynamic situational\nawareness and the design of user-adaptive human-robot interfaces. Therefore,\nthis work contributes to the future design of more collaborative and effective\nhuman-robot teams.\n","authors":["Hashini Senaratne","Leimin Tian","Pavan Sikka","Jason Williams","David Howard","Dana Kulić","Cécile Paris"],"pdf_url":"https://arxiv.org/pdf/2501.08507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09203v1","updated":"2025-01-15T23:36:05Z","published":"2025-01-15T23:36:05Z","title":"Unified Few-shot Crack Segmentation and its Precise 3D Automatic\n Measurement in Concrete Structures","summary":" Visual-Spatial Systems has become increasingly essential in concrete crack\ninspection. 
However, existing methods often lacks adaptability to diverse\nscenarios, exhibits limited robustness in image-based approaches, and struggles\nwith curved or complex geometries. To address these limitations, an innovative\nframework for two-dimensional (2D) crack detection, three-dimensional (3D)\nreconstruction, and 3D automatic crack measurement was proposed by integrating\ncomputer vision technologies and multi-modal Simultaneous localization and\nmapping (SLAM) in this study. Firstly, building on a base DeepLabv3+\nsegmentation model, and incorporating specific refinements utilizing foundation\nmodel Segment Anything Model (SAM), we developed a crack segmentation method\nwith strong generalization across unfamiliar scenarios, enabling the generation\nof precise 2D crack masks. To enhance the accuracy and robustness of 3D\nreconstruction, Light Detection and Ranging (LiDAR) point clouds were utilized\ntogether with image data and segmentation masks. By leveraging both image- and\nLiDAR-SLAM, we developed a multi-frame and multi-modal fusion framework that\nproduces dense, colorized point clouds, effectively capturing crack semantics\nat a 3D real-world scale. Furthermore, the crack geometric attributions were\nmeasured automatically and directly within 3D dense point cloud space,\nsurpassing the limitations of conventional 2D image-based measurements. This\nadvancement makes the method suitable for structural components with curved and\ncomplex 3D geometries. Experimental results across various concrete structures\nhighlight the significant improvements and unique advantages of the proposed\nmethod, demonstrating its effectiveness, accuracy, and robustness in real-world\napplications.\n","authors":["Pengru Deng","Jiapeng Yao","Chun Li","Su Wang","Xinrun Li","Varun Ojha","Xuhui He","Takashi Matsumoto"],"pdf_url":"https://arxiv.org/pdf/2501.09203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09198v1","updated":"2025-01-15T23:12:38Z","published":"2025-01-15T23:12:38Z","title":"Combining Movement Primitives with Contraction Theory","summary":" This paper presents a modular framework for motion planning using movement\nprimitives. Central to the approach is Contraction Theory, a modular stability\ntool for nonlinear dynamical systems. The approach extends prior methods by\nachieving parallel and sequential combinations of both discrete and rhythmic\nmovements, while enabling independent modulation of each movement. This modular\nframework enables a divide-and-conquer strategy to simplify the programming of\ncomplex robot motion planning. Simulation examples illustrate the flexibility\nand versatility of the framework, highlighting its potential to address diverse\nchallenges in robot motion planning.\n","authors":["Moses C. Nah","Johannes Lachner","Neville Hogan","Jean-Jacques Slotine"],"pdf_url":"https://arxiv.org/pdf/2501.09198v1.pdf","comment":"8 pages, 4 figures, submitted to Robotics and Automation Letters\n (RA-L) for review"},{"id":"http://arxiv.org/abs/2501.09192v1","updated":"2025-01-15T22:50:02Z","published":"2025-01-15T22:50:02Z","title":"Estimation-Aware Trajectory Optimization with Set-Valued Measurement\n Uncertainties","summary":" In this paper, we present an optimization-based framework for generating\nestimation-aware trajectories in scenarios where measurement (output)\nuncertainties are state-dependent and set-valued. The framework leverages the\nconcept of regularity for set-valued output maps. 
Specifically, we demonstrate\nthat, for output-regular maps, one can utilize a set-valued observability\nmeasure that is concave with respect to finite-horizon state trajectories. By\nmaximizing this measure, optimized estimation-aware trajectories can be\ndesigned for a broad class of systems, including those with locally linearized\ndynamics. To illustrate the effectiveness of the proposed approach, we provide\na representative example in the context of trajectory planning for vision-based\nestimation. We present an estimation-aware trajectory for an uncooperative\ntarget-tracking problem that uses a machine learning (ML)-based estimation\nmodule on an ego-satellite.\n","authors":["Aditya Deole","Mehran Mesbahi"],"pdf_url":"https://arxiv.org/pdf/2501.09192v1.pdf","comment":"25 pages, 5 figures"},{"id":"http://arxiv.org/abs/2501.09167v1","updated":"2025-01-15T21:36:19Z","published":"2025-01-15T21:36:19Z","title":"Embodied Scene Understanding for Vision Language Models via MetaVQA","summary":" Vision Language Models (VLMs) demonstrate significant potential as embodied\nAI agents for various mobility applications. However, a standardized,\nclosed-loop benchmark for evaluating their spatial reasoning and sequential\ndecision-making capabilities is lacking. To address this, we present MetaVQA: a\ncomprehensive benchmark designed to assess and enhance VLMs' understanding of\nspatial relationships and scene dynamics through Visual Question Answering\n(VQA) and closed-loop simulations. MetaVQA leverages Set-of-Mark prompting and\ntop-down view ground-truth annotations from nuScenes and Waymo datasets to\nautomatically generate extensive question-answer pairs based on diverse\nreal-world traffic scenarios, ensuring object-centric and context-rich\ninstructions. Our experiments show that fine-tuning VLMs with the MetaVQA\ndataset significantly improves their spatial reasoning and embodied scene\ncomprehension in safety-critical simulations, evident not only in improved VQA\naccuracies but also in emerging safety-aware driving maneuvers. In addition,\nthe learning demonstrates strong transferability from simulation to real-world\nobservation. Code and data will be publicly available at\nhttps://metadriverse.github.io/metavqa .\n","authors":["Weizhen Wang","Chenda Duan","Zhenghao Peng","Yuxin Liu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.09167v1.pdf","comment":"for the project webpage, see https://metadriverse.github.io/metavqa"},{"id":"http://arxiv.org/abs/2501.09160v1","updated":"2025-01-15T21:22:09Z","published":"2025-01-15T21:22:09Z","title":"AutoLoop: Fast Visual SLAM Fine-tuning through Agentic Curriculum\n Learning","summary":" Current visual SLAM systems face significant challenges in balancing\ncomputational efficiency with robust loop closure handling. Traditional\napproaches require careful manual tuning and incur substantial computational\noverhead, while learning-based methods either lack explicit loop closure\ncapabilities or implement them through computationally expensive methods. We\npresent AutoLoop, a novel approach that combines automated curriculum learning\nwith efficient fine-tuning for visual SLAM systems. Our method employs a DDPG\n(Deep Deterministic Policy Gradient) agent to dynamically adjust loop closure\nweights during training, eliminating the need for manual hyperparameter search\nwhile significantly reducing the required training steps. 
The approach\npre-computes potential loop closure pairs offline and leverages them through an\nagent-guided curriculum, allowing the model to adapt efficiently to new\nscenarios. Experiments conducted on TartanAir for training and validated across\nmultiple benchmarks including KITTI, EuRoC, ICL-NUIM and TUM RGB-D demonstrate\nthat AutoLoop achieves comparable or superior performance while reducing\ntraining time by an order of magnitude compared to traditional approaches.\nAutoLoop provides a practical solution for rapid adaptation of visual SLAM\nsystems, automating the weight tuning process that traditionally requires\nmultiple manual iterations. Our results show that this automated curriculum\nstrategy not only accelerates training but also maintains or improves the\nmodel's performance across diverse environmental conditions.\n","authors":["Assaf Lahiany","Oren Gal"],"pdf_url":"https://arxiv.org/pdf/2501.09160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.04999v2","updated":"2025-01-15T04:45:12Z","published":"2022-09-12T03:12:04Z","title":"Experimental Study on The Effect of Multi-step Deep Reinforcement\n Learning in POMDPs","summary":" Deep Reinforcement Learning (DRL) has made tremendous advances in both\nsimulated and real-world robot control tasks in recent years. This is\nparticularly the case for tasks that can be carefully engineered with a full\nstate representation, and which can then be formulated as a Markov Decision\nProcess (MDP). However, applying DRL strategies designed for MDPs to novel\nrobot control tasks can be challenging, because the available observations may\nbe a partial representation of the state, resulting in a Partially Observable\nMarkov Decision Process (POMDP). This paper considers three popular DRL\nalgorithms, namely Proximal Policy Optimization (PPO), Twin Delayed Deep\nDeterministic Policy Gradient (TD3), and Soft Actor-Critic (SAC), invented for\nMDPs, and studies their performance in POMDP scenarios. While prior work has\nfound that SAC and TD3 typically outperform PPO across a broad range of tasks\nthat can be represented as MDPs, we show that this is not always the case,\nusing three representative POMDP environments. Empirical studies show that this\nis related to multi-step bootstrapping, where multi-step immediate rewards,\ninstead of one-step immediate reward, are used to calculate the target value\nestimation of an observation and action pair. We identify this by observing\nthat the inclusion of multi-step bootstrapping in TD3 (MTD3) and SAC (MSAC)\nresults in improved robustness in POMDP settings.\n","authors":["Lingheng Meng","Rob Gorbet","Michael Burke","Dana Kulić"],"pdf_url":"https://arxiv.org/pdf/2209.04999v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.09019v1","updated":"2025-01-15T18:59:15Z","published":"2025-01-15T18:59:15Z","title":"Ouroboros-Diffusion: Exploring Consistent Content Generation in\n Tuning-free Long Video Diffusion","summary":" The first-in-first-out (FIFO) video diffusion, built on a pre-trained\ntext-to-video model, has recently emerged as an effective approach for\ntuning-free long video generation. This technique maintains a queue of video\nframes with progressively increasing noise, continuously producing clean frames\nat the queue's head while Gaussian noise is enqueued at the tail. However,\nFIFO-Diffusion often struggles to keep long-range temporal consistency in the\ngenerated videos due to the lack of correspondence modeling across frames. 
In\nthis paper, we propose Ouroboros-Diffusion, a novel video denoising framework\ndesigned to enhance structural and content (subject) consistency, enabling the\ngeneration of consistent videos of arbitrary length. Specifically, we introduce\na new latent sampling technique at the queue tail to improve structural\nconsistency, ensuring perceptually smooth transitions among frames. To enhance\nsubject consistency, we devise a Subject-Aware Cross-Frame Attention (SACFA)\nmechanism, which aligns subjects across frames within short segments to achieve\nbetter visual coherence. Furthermore, we introduce self-recurrent guidance.\nThis technique leverages information from all previous cleaner frames at the\nfront of the queue to guide the denoising of noisier frames at the end,\nfostering rich and contextual global information interaction. Extensive\nexperiments of long video generation on the VBench benchmark demonstrate the\nsuperiority of our Ouroboros-Diffusion, particularly in terms of subject\nconsistency, motion smoothness, and temporal consistency.\n","authors":["Jingyuan Chen","Fuchen Long","Jie An","Zhaofan Qiu","Ting Yao","Jiebo Luo","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2501.09019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14505v2","updated":"2025-01-15T18:57:31Z","published":"2024-07-19T17:58:36Z","title":"T2V-CompBench: A Comprehensive Benchmark for Compositional Text-to-video\n Generation","summary":" Text-to-video (T2V) generative models have advanced significantly, yet their\nability to compose different objects, attributes, actions, and motions into a\nvideo remains unexplored. Previous text-to-video benchmarks also neglect this\nimportant ability for evaluation. In this work, we conduct the first systematic\nstudy on compositional text-to-video generation. We propose T2V-CompBench, the\nfirst benchmark tailored for compositional text-to-video generation.\nT2V-CompBench encompasses diverse aspects of compositionality, including\nconsistent attribute binding, dynamic attribute binding, spatial relationships,\nmotion binding, action binding, object interactions, and generative numeracy.\nWe further carefully design evaluation metrics of multimodal large language\nmodel (MLLM)-based, detection-based, and tracking-based metrics, which can\nbetter reflect the compositional text-to-video generation quality of seven\nproposed categories with 1400 text prompts. The effectiveness of the proposed\nmetrics is verified by correlation with human evaluations. We also benchmark\nvarious text-to-video generative models and conduct in-depth analysis across\ndifferent models and various compositional categories. We find that\ncompositional text-to-video generation is highly challenging for current\nmodels, and we hope our attempt could shed light on future research in this\ndirection.\n","authors":["Kaiyue Sun","Kaiyi Huang","Xian Liu","Yue Wu","Zihan Xu","Zhenguo Li","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2407.14505v2.pdf","comment":"Project page: https://t2v-compbench-2025.github.io/ Code:\n https://github.com/KaiyueSun98/T2V-CompBench/tree/V2"},{"id":"http://arxiv.org/abs/2501.09012v1","updated":"2025-01-15T18:56:22Z","published":"2025-01-15T18:56:22Z","title":"Multimodal LLMs Can Reason about Aesthetics in Zero-Shot","summary":" We present the first study on how Multimodal LLMs' (MLLMs) reasoning ability\nshall be elicited to evaluate the aesthetics of artworks. 
To facilitate this\ninvestigation, we construct MM-StyleBench, a novel high-quality dataset for\nbenchmarking artistic stylization. We then develop a principled method for\nhuman preference modeling and perform a systematic correlation analysis between\nMLLMs' responses and human preference. Our experiments reveal an inherent\nhallucination issue of MLLMs in art evaluation, associated with response\nsubjectivity. ArtCoT is proposed, demonstrating that art-specific task\ndecomposition and the use of concrete language boost MLLMs' reasoning ability\nfor aesthetics. Our findings offer valuable insights into MLLMs for art and can\nbenefit a wide range of downstream applications, such as style transfer and\nartistic image generation. Code available at\nhttps://github.com/songrise/MLLM4Art.\n","authors":["Ruixiang Jiang","Changwen Chen"],"pdf_url":"https://arxiv.org/pdf/2501.09012v1.pdf","comment":"WIP, Homepage https://github.com/songrise/MLLM4Art"},{"id":"http://arxiv.org/abs/2501.09008v1","updated":"2025-01-15T18:48:38Z","published":"2025-01-15T18:48:38Z","title":"SimGen: A Diffusion-Based Framework for Simultaneous Surgical Image and\n Segmentation Mask Generation","summary":" Acquiring and annotating surgical data is often resource-intensive, ethical\nconstraining, and requiring significant expert involvement. While generative AI\nmodels like text-to-image can alleviate data scarcity, incorporating spatial\nannotations, such as segmentation masks, is crucial for precision-driven\nsurgical applications, simulation, and education. This study introduces both a\nnovel task and method, SimGen, for Simultaneous Image and Mask Generation.\nSimGen is a diffusion model based on the DDPM framework and Residual U-Net,\ndesigned to jointly generate high-fidelity surgical images and their\ncorresponding segmentation masks. The model leverages cross-correlation priors\nto capture dependencies between continuous image and discrete mask\ndistributions. Additionally, a Canonical Fibonacci Lattice (CFL) is employed to\nenhance class separability and uniformity in the RGB space of the masks. SimGen\ndelivers high-fidelity images and accurate segmentation masks, outperforming\nbaselines across six public datasets assessed on image and semantic inception\ndistance metrics. Ablation study shows that the CFL improves mask quality and\nspatial separation. Downstream experiments suggest generated image-mask pairs\nare usable if regulations limit human data release for research. This work\noffers a cost-effective solution for generating paired surgical images and\ncomplex labels, advancing surgical AI development by reducing the need for\nexpensive manual annotations.\n","authors":["Aditya Bhat","Rupak Bose","Chinedu Innocent Nwoye","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2501.09008v1.pdf","comment":"12 pages, 17 figures, 4 tables, project page at\n https://camma-public.github.io/endogen/"},{"id":"http://arxiv.org/abs/2403.13163v5","updated":"2025-01-15T18:45:15Z","published":"2024-03-19T21:31:31Z","title":"DeblurDiNAT: A Compact Model with Exceptional Generalization and Visual\n Fidelity on Unseen Domains","summary":" Recent deblurring networks have effectively restored clear images from the\nblurred ones. However, they often struggle with generalization to unknown\ndomains. Moreover, these models typically focus on distortion metrics such as\nPSNR and SSIM, neglecting the critical aspect of metrics aligned with human\nperception. 
To address these limitations, we propose DeblurDiNAT, a deblurring\nTransformer based on Dilated Neighborhood Attention. First, DeblurDiNAT employs\nan alternating dilation factor paradigm to capture both local and global\nblurred patterns, enhancing generalization and perceptual clarity. Second, a\nlocal cross-channel learner aids the Transformer block to understand the\nshort-range relationships between adjacent channels. Additionally, we present a\nlinear feed-forward network with a simple yet effective design. Finally, a\ndual-stage feature fusion module is introduced as an alternative to the\nexisting approach, which efficiently processes multi-scale visual information\nacross network levels. Compared to state-of-the-art models, our compact\nDeblurDiNAT demonstrates superior generalization capabilities and achieves\nremarkable performance in perceptual metrics, while maintaining a favorable\nmodel size.\n","authors":["Hanzhou Liu","Binghan Li","Chengkai Liu","Mi Lu"],"pdf_url":"https://arxiv.org/pdf/2403.13163v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09001v1","updated":"2025-01-15T18:30:58Z","published":"2025-01-15T18:30:58Z","title":"Vision Foundation Models for Computed Tomography","summary":" Foundation models (FMs) have shown transformative potential in radiology by\nperforming diverse, complex tasks across imaging modalities. Here, we developed\nCT-FM, a large-scale 3D image-based pre-trained model designed explicitly for\nvarious radiological tasks. CT-FM was pre-trained using 148,000 computed\ntomography (CT) scans from the Imaging Data Commons through label-agnostic\ncontrastive learning. We evaluated CT-FM across four categories of tasks,\nnamely, whole-body and tumor segmentation, head CT triage, medical image\nretrieval, and semantic understanding, showing superior performance against\nstate-of-the-art models. Beyond quantitative success, CT-FM demonstrated the\nability to cluster regions anatomically and identify similar anatomical and\nstructural concepts across scans. Furthermore, it remained robust across\ntest-retest settings and indicated reasonable salient regions attached to its\nembeddings. This study demonstrates the value of large-scale medical imaging\nfoundation models and by open-sourcing the model weights, code, and data, aims\nto support more adaptable, reliable, and interpretable AI solutions in\nradiology.\n","authors":["Suraj Pai","Ibrahim Hadzic","Dennis Bontempi","Keno Bressem","Benjamin H. Kann","Andriy Fedorov","Raymond H. Mak","Hugo J. W. L. Aerts"],"pdf_url":"https://arxiv.org/pdf/2501.09001v1.pdf","comment":"6 figures, followed by 9 Extended Data Figures and a Supplementary\n Information document"},{"id":"http://arxiv.org/abs/2501.01557v2","updated":"2025-01-15T18:29:56Z","published":"2025-01-02T22:24:13Z","title":"Click-Calib: A Robust Extrinsic Calibration Method for Surround-View\n Systems","summary":" Surround-View System (SVS) is an essential component in Advanced Driver\nAssistance System (ADAS) and requires precise calibrations. However,\nconventional offline extrinsic calibration methods are cumbersome and\ntime-consuming as they rely heavily on physical patterns. Additionally, these\nmethods primarily focus on short-range areas surrounding the vehicle, resulting\nin lower calibration quality in more distant zones. To address these\nlimitations, we propose Click-Calib, a pattern-free approach for offline SVS\nextrinsic calibration. 
Without requiring any special setup, the user only needs\nto click a few keypoints on the ground in natural scenes. Unlike other offline\ncalibration approaches, Click-Calib optimizes camera poses over a wide range by\nminimizing reprojection distance errors of keypoints, thereby achieving\naccurate calibrations at both short and long distances. Furthermore,\nClick-Calib supports both single-frame and multiple-frame modes, with the\nlatter offering even better results. Evaluations on our in-house dataset and\nthe public WoodScape dataset demonstrate its superior accuracy and robustness\ncompared to baseline methods. Code is available at\nhttps://github.com/lwangvaleo/click_calib.\n","authors":["Lihao Wang"],"pdf_url":"https://arxiv.org/pdf/2501.01557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06848v2","updated":"2025-01-15T18:28:37Z","published":"2025-01-12T15:34:24Z","title":"A General Framework for Inference-time Scaling and Steering of Diffusion\n Models","summary":" Diffusion models produce impressive results in modalities ranging from images\nand video to protein design and text. However, generating samples with\nuser-specified properties remains a challenge. Recent research proposes\nfine-tuning models to maximize rewards that capture desired properties, but\nthese methods require expensive training and are prone to mode collapse. In\nthis work, we propose Feynman Kac (FK) steering, an inference-time framework\nfor steering diffusion models with reward functions. FK steering works by\nsampling a system of multiple interacting diffusion processes, called\nparticles, and resampling particles at intermediate steps based on scores\ncomputed using functions called potentials. Potentials are defined using\nrewards for intermediate states and are selected such that a high value\nindicates that the particle will yield a high-reward sample. We explore various\nchoices of potentials, intermediate rewards, and samplers. We evaluate FK\nsteering on text-to-image and text diffusion models. For steering text-to-image\nmodels with a human preference reward, we find that FK steering a 0.8B\nparameter model outperforms a 2.6B parameter fine-tuned model on prompt\nfidelity, with faster sampling and no training. For steering text diffusion\nmodels with rewards for text quality and specific text attributes, we find that\nFK steering generates lower perplexity, more linguistically acceptable outputs\nand enables gradient-free control of attributes like toxicity. Our results\ndemonstrate that inference-time scaling and steering of diffusion models, even\nwith off-the-shelf rewards, can provide significant sample quality gains and\ncontrollability benefits. Code is available at\nhttps://github.com/zacharyhorvitz/Fk-Diffusion-Steering .\n","authors":["Raghav Singhal","Zachary Horvitz","Ryan Teehan","Mengye Ren","Zhou Yu","Kathleen McKeown","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2501.06848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08994v1","updated":"2025-01-15T18:20:37Z","published":"2025-01-15T18:20:37Z","title":"RepVideo: Rethinking Cross-Layer Representation for Video Generation","summary":" Video generation has achieved remarkable progress with the introduction of\ndiffusion models, which have significantly improved the quality of generated\nvideos. However, recent research has primarily focused on scaling up model\ntraining, while offering limited insights into the direct impact of\nrepresentations on the video generation process. 
In this paper, we initially\ninvestigate the characteristics of features in intermediate layers, finding\nsubstantial variations in attention maps across different layers. These\nvariations lead to unstable semantic representations and contribute to\ncumulative differences between features, which ultimately reduce the similarity\nbetween adjacent frames and negatively affect temporal coherence. To address\nthis, we propose RepVideo, an enhanced representation framework for\ntext-to-video diffusion models. By accumulating features from neighboring\nlayers to form enriched representations, this approach captures more stable\nsemantic information. These enhanced representations are then used as inputs to\nthe attention mechanism, thereby improving semantic expressiveness while\nensuring feature consistency across adjacent frames. Extensive experiments\ndemonstrate that our RepVideo not only significantly enhances the ability to\ngenerate accurate spatial appearances, such as capturing complex spatial\nrelationships between multiple objects, but also improves temporal consistency\nin video generation.\n","authors":["Chenyang Si","Weichen Fan","Zhengyao Lv","Ziqi Huang","Yu Qiao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08994v1.pdf","comment":"Project page: https://vchitect.github.io/RepVid-Webpage"},{"id":"http://arxiv.org/abs/2409.01998v2","updated":"2025-01-15T18:07:13Z","published":"2024-09-03T15:43:44Z","title":"SA-MLP: A Low-Power Multiplication-Free Deep Network for 3D Point Cloud\n Classification in Resource-Constrained Environments","summary":" Point cloud classification plays a crucial role in the processing and\nanalysis of data from 3D sensors such as LiDAR, which are commonly used in\napplications like autonomous vehicles, robotics, and environmental monitoring.\nHowever, traditional neural networks, which rely heavily on multiplication\noperations, often face challenges in terms of high computational costs and\nenergy consumption. This study presents a novel family of efficient MLP-based\narchitectures designed to improve the computational efficiency of point cloud\nclassification tasks in sensor systems. The baseline model, Mul-MLP, utilizes\nconventional multiplication operations, while Add-MLP and Shift-MLP replace\nmultiplications with addition and shift operations, respectively. These\nreplacements leverage more sensor-friendly operations that can significantly\nreduce computational overhead, making them particularly suitable for\nresource-constrained sensor platforms. To further enhance performance, we\npropose SA-MLP, a hybrid architecture that alternates between shift and adder\nlayers, preserving the network depth while optimizing computational efficiency.\nUnlike previous approaches such as ShiftAddNet, which increase the layer count\nand limit representational capacity by freezing shift weights, SA-MLP fully\nexploits the complementary advantages of shift and adder layers by employing\ndistinct learning rates and optimizers. 
Experimental results show that Add-MLP\nand Shift-MLP achieve competitive performance compared to Mul-MLP, while SA-MLP\nsurpasses the baseline, delivering results comparable to state-of-the-art MLP\nmodels in terms of both classification accuracy and computational efficiency.\nThis work offers a promising, energy-efficient solution for sensor-driven\napplications requiring real-time point cloud classification, particularly in\nenvironments with limited computational resources.\n","authors":["Qiang Zheng","Chao Zhang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2409.01998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08983v1","updated":"2025-01-15T17:59:56Z","published":"2025-01-15T17:59:56Z","title":"CityDreamer4D: Compositional Generative Model of Unbounded 4D Cities","summary":" 3D scene generation has garnered growing attention in recent years and has\nmade significant progress. Generating 4D cities is more challenging than 3D\nscenes due to the presence of structurally complex, visually diverse objects\nlike buildings and vehicles, and heightened human sensitivity to distortions in\nurban environments. To tackle these issues, we propose CityDreamer4D, a\ncompositional generative model specifically tailored for generating unbounded\n4D cities. Our main insights are 1) 4D city generation should separate dynamic\nobjects (e.g., vehicles) from static scenes (e.g., buildings and roads), and 2)\nall objects in the 4D scene should be composed of different types of neural\nfields for buildings, vehicles, and background stuff. Specifically, we propose\nTraffic Scenario Generator and Unbounded Layout Generator to produce dynamic\ntraffic scenarios and static city layouts using a highly compact BEV\nrepresentation. Objects in 4D cities are generated by combining stuff-oriented\nand instance-oriented neural fields for background stuff, buildings, and\nvehicles. To suit the distinct characteristics of background stuff and\ninstances, the neural fields employ customized generative hash grids and\nperiodic positional embeddings as scene parameterizations. Furthermore, we\noffer a comprehensive suite of datasets for city generation, including OSM,\nGoogleEarth, and CityTopia. The OSM dataset provides a variety of real-world\ncity layouts, while the Google Earth and CityTopia datasets deliver\nlarge-scale, high-quality city imagery complete with 3D instance annotations.\nLeveraging its compositional design, CityDreamer4D supports a range of\ndownstream applications, such as instance editing, city stylization, and urban\nsimulation, while delivering state-of-the-art performance in generating\nrealistic 4D cities.\n","authors":["Haozhe Xie","Zhaoxi Chen","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08982v1","updated":"2025-01-15T17:59:32Z","published":"2025-01-15T17:59:32Z","title":"CityLoc: 6 DoF Localization of Text Descriptions in Large-Scale Scenes\n with Gaussian Representation","summary":" Localizing text descriptions in large-scale 3D scenes is inherently an\nambiguous task. This nonetheless arises while describing general concepts, e.g.\nall traffic lights in a city.\n To facilitate reasoning based on such concepts, text localization in the form\nof distribution is required. 
In this paper, we generate the distribution of the\ncamera poses conditioned upon the textual description.\n To facilitate such generation, we propose a diffusion-based architecture that\nconditionally diffuses the noisy 6DoF camera poses to their plausible\nlocations.\n The conditional signals are derived from the text descriptions, using the\npre-trained text encoders. The connection between text descriptions and pose\ndistribution is established through a pretrained Vision-Language-Model, i.e.\nCLIP. Furthermore, we demonstrate that the candidate poses for the distribution\ncan be further refined by rendering potential poses using 3D Gaussian\nsplatting, guiding incorrectly posed samples towards locations that better\nalign with the textual description, through visual reasoning.\n We demonstrate the effectiveness of our method by comparing it with both\nstandard retrieval methods and learning-based approaches. Our proposed method\nconsistently outperforms these baselines across all five large-scale datasets.\nOur source code and dataset will be made publicly available.\n","authors":["Qi Ma","Runyi Yang","Bin Ren","Ender Konukoglu","Luc Van Gool","Danda Pani Paudel"],"pdf_url":"https://arxiv.org/pdf/2501.08982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06394v5","updated":"2025-01-15T17:56:35Z","published":"2023-11-10T20:50:36Z","title":"A design of Convolutional Neural Network model for the Diagnosis of the\n COVID-19","summary":" With the spread of COVID-19 around the globe over the past year, the usage of\nartificial intelligence (AI) algorithms and image processing methods to analyze\nthe X-ray images of patients' chest with COVID-19 has become essential. The\nCOVID-19 virus recognition in the lung area of a patient is one of the basic\nand essential needs of clinical centers and hospitals. Most research in this\nfield has been devoted to papers on the basis of deep learning methods\nutilizing CNNs (Convolutional Neural Network), which mainly deal with the\nscreening of sick and healthy people. In this study, a new structure of a\n19-layer CNN has been recommended for accurate recognition of COVID-19\nfrom the X-ray pictures of the chest. The offered CNN is developed to serve as a\nprecise diagnosis system for a three class (viral pneumonia, Normal, COVID) and\na four class classification (Lung opacity, Normal, COVID-19, and pneumonia). A\ncomparison is conducted among the outcomes of the offered procedure and some\npopular pretrained networks, including Inception, Alexnet, ResNet50,\nSqueezenet, and VGG19 and based on Specificity, Accuracy, Precision,\nSensitivity, Confusion Matrix, and F1-score. The experimental results of the\noffered CNN method specify its dominance over the existing published\nprocedures. This method can be a useful tool for clinicians in deciding\nproperly about COVID-19.\n","authors":["Xinyuan Song"],"pdf_url":"https://arxiv.org/pdf/2311.06394v5.pdf","comment":"Important mistakes found. There's no new version currently. Also\n contradiction with authorship"},{"id":"http://arxiv.org/abs/2501.05179v2","updated":"2025-01-15T17:34:26Z","published":"2025-01-09T11:57:58Z","title":"Compression with Global Guidance: Towards Training-free High-Resolution\n MLLMs Acceleration","summary":" Multimodal large language models (MLLMs) have attracted considerable\nattention due to their exceptional performance in visual content understanding\nand reasoning. 
However, their inference efficiency has been a notable concern,\nas the increasing length of multimodal contexts leads to quadratic complexity.\nToken compression techniques, which reduce the number of visual tokens, have\ndemonstrated their effectiveness in reducing computational costs. Yet, these\napproaches have struggled to keep pace with the rapid advancements in MLLMs,\nespecially the AnyRes strategy in the context of high-resolution image\nunderstanding. In this paper, we propose a novel token compression method,\nGlobalCom$^2$, tailored for high-resolution MLLMs that receive both the\nthumbnail and multiple crops. GlobalCom$^2$ treats the tokens derived from the\nthumbnail as the \"commander\" of the entire token compression process, directing\nthe allocation of retention ratios and the specific compression for each crop.\nIn this way, redundant tokens are eliminated while important local details are\nadaptively preserved to the highest extent feasible. Empirical results across\n10 benchmarks reveal that GlobalCom$^2$ achieves an optimal balance between\nperformance and efficiency, and consistently outperforms state-of-the-art token\ncompression methods with LLaVA-NeXT-7B/13B models. Our code is released at\nhttps://github.com/xuyang-liu16/GlobalCom2.\n","authors":["Xuyang Liu","Ziming Wang","Yuhang Han","Yingyao Wang","Jiale Yuan","Jun Song","Bo Zheng","Linfeng Zhang","Siteng Huang","Honggang Chen"],"pdf_url":"https://arxiv.org/pdf/2501.05179v2.pdf","comment":"Our code is released at\n \\url{https://github.com/xuyang-liu16/GlobalCom2}"},{"id":"http://arxiv.org/abs/2501.08962v1","updated":"2025-01-15T17:18:46Z","published":"2025-01-15T17:18:46Z","title":"An analysis of data variation and bias in image-based dermatological\n datasets for machine learning classification","summary":" AI algorithms have become valuable in aiding professionals in healthcare. The\nincreasing confidence obtained by these models is helpful in critical decision\ndemands. In clinical dermatology, classification models can detect malignant\nlesions on patients' skin using only RGB images as input. However, most\nlearning-based methods employ data acquired from dermoscopic datasets on\ntraining, which are large and validated by a gold standard. Clinical models aim\nto deal with classification on users' smartphone cameras that do not contain\nthe corresponding resolution provided by dermoscopy. Also, clinical\napplications bring new challenges. It can contain captures from uncontrolled\nenvironments, skin tone variations, viewpoint changes, noises in data and\nlabels, and unbalanced classes. A possible alternative would be to use transfer\nlearning to deal with the clinical images. However, as the number of samples is\nlow, it can cause degradations on the model's performance; the source\ndistribution used in training differs from the test set. This work aims to\nevaluate the gap between dermoscopic and clinical samples and understand how\nthe dataset variations impact training. It assesses the main differences\nbetween distributions that disturb the model's prediction. 
Finally, from\nexperiments on different architectures, we argue how to combine the data from\ndivergent distributions, decreasing the impact on the model's final accuracy.\n","authors":["Francisco Mauro","Emanoel Thyago","Othon Vinicius","Rodrigo Abreu","Kelvin Cunha","José Gabriel","Rafael Barros","Thales Bezerra","Manoel Henriques","Natalia Lopes","Érico Moutinho","Jéssica Guido","Tsang Ing Ren","Paulo Borba"],"pdf_url":"https://arxiv.org/pdf/2501.08962v1.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2312.02186v3","updated":"2025-01-15T17:11:20Z","published":"2023-12-01T20:16:02Z","title":"Identifying Spurious Correlations using Counterfactual Alignment","summary":" Models driven by spurious correlations often yield poor generalization\nperformance. We propose the counterfactual (CF) alignment method to detect and\nquantify spurious correlations of black box classifiers. Our methodology is\nbased on counterfactual images generated with respect to one classifier being\ninput into other classifiers to see if they also induce changes in the outputs\nof these classifiers. The relationship between these responses can be\nquantified and used to identify specific instances where a spurious correlation\nexists. This is validated by observing intuitive trends in face-attribute and\nwaterbird classifiers, as well as by fabricating spurious correlations and\ndetecting their presence, both visually and quantitatively. Furthermore,\nutilizing the CF alignment method, we demonstrate that we can evaluate robust\noptimization methods (GroupDRO, JTT, and FLAC) by detecting a reduction in\nspurious correlations.\n","authors":["Joseph Paul Cohen","Louis Blankemeier","Akshay Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2312.02186v3.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR), Code:\n https://github.com/ieee8023/latentshift"},{"id":"http://arxiv.org/abs/2409.17137v4","updated":"2025-01-15T16:56:26Z","published":"2024-09-25T17:56:00Z","title":"PACE: Marrying generalization in PArameter-efficient fine-tuning with\n Consistency rEgularization","summary":" Parameter-Efficient Fine-Tuning (PEFT) effectively adapts pre-trained\ntransformers to downstream tasks. However, the optimization of tasks\nperformance often comes at the cost of generalizability in fine-tuned models.\nTo address this issue, we theoretically connect smaller weight gradient norms\nduring training and larger datasets to the improvements in model\ngeneralization. Motivated by this connection, we propose reducing gradient\nnorms for enhanced generalization and aligning fine-tuned model with the\npre-trained counterpart to retain knowledge from large-scale pre-training data.\nYet, naive alignment does not guarantee gradient reduction and can potentially\ncause gradient explosion, complicating efforts to manage gradients. To address\nsuch an issue, we propose PACE, marrying generalization of PArameter-efficient\nfine-tuning with Consistency rEgularization. We perturb features learned from\nthe adapter with the multiplicative noise and ensure the fine-tuned model\nremains consistent for same sample under different perturbations. Theoretical\nanalysis shows that PACE not only implicitly regularizes gradients for enhanced\ngeneralization, but also implicitly aligns the fine-tuned and pre-trained\nmodels to retain knowledge. Experimental evidence supports our theories. 
PACE\nsurpasses existing PEFT methods in visual adaptation tasks (VTAB-1k, FGVC,\nfew-shot learning, domain adaptation) showcasing its potential for\nresource-efficient fine-tuning. It also improves LoRA in text classification\n(GLUE) and mathematical reasoning (GSM-8K). The code is available at\nhttps://github.com/MaxwellYaoNi/PACE\n","authors":["Yao Ni","Shan Zhang","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2409.17137v4.pdf","comment":"Accepted by NeurIPS 2024 as a spotlight"},{"id":"http://arxiv.org/abs/2412.14816v3","updated":"2025-01-15T16:54:36Z","published":"2024-12-19T13:10:03Z","title":"TextSleuth: Towards Explainable Tampered Text Detection","summary":" Recently, tampered text detection has attracted increasing attention due to\nits essential role in information security. Although existing methods can\ndetect the tampered text region, the interpretation of such detection remains\nunclear, making the prediction unreliable. To address this problem, we propose\nto explain the basis of tampered text detection with natural language via large\nmultimodal models. To fill the data gap for this task, we propose a\nlarge-scale, comprehensive dataset, ETTD, which contains both pixel-level\nannotations for tampered text region and natural language annotations\ndescribing the anomaly of the tampered text. Multiple methods are employed to\nimprove the quality of the proposed data. For example, elaborate queries are\nintroduced to generate high-quality anomaly descriptions with GPT4o. A fused\nmask prompt is proposed to reduce confusion when querying GPT4o to generate\nanomaly descriptions. To automatically filter out low-quality annotations, we\nalso propose to prompt GPT4o to recognize tampered texts before describing the\nanomaly, and to filter out the responses with low OCR accuracy. To further\nimprove explainable tampered text detection, we propose a simple yet effective\nmodel called TextSleuth, which achieves improved fine-grained perception and\ncross-domain generalization by focusing on the suspected region, with a\ntwo-stage analysis paradigm and an auxiliary grounding prompt. Extensive\nexperiments on both the ETTD dataset and the public dataset have verified the\neffectiveness of the proposed methods. In-depth analysis is also provided to\ninspire further research. Our dataset and code will be open-source.\n","authors":["Chenfan Qu","Jian Liu","Haoxing Chen","Baihan Yu","Jingjing Liu","Weiqiang Wang","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2412.14816v3.pdf","comment":"The first work for explainable tampered text detection"},{"id":"http://arxiv.org/abs/2501.08931v1","updated":"2025-01-15T16:34:20Z","published":"2025-01-15T16:34:20Z","title":"Visual WetlandBirds Dataset: Bird Species Identification and Behavior\n Recognition in Videos","summary":" The current biodiversity loss crisis makes animal monitoring a relevant field\nof study. In light of this, data collected through monitoring can provide\nessential insights, and information for decision-making aimed at preserving\nglobal biodiversity. Despite the importance of such data, there is a notable\nscarcity of datasets featuring videos of birds, and none of the existing\ndatasets offer detailed annotations of bird behaviors in video format. 
In\nresponse to this gap, our study introduces the first fine-grained video dataset\nspecifically designed for bird behavior detection and species classification.\nThis dataset addresses the need for comprehensive bird video datasets and\nprovides detailed data on bird actions, facilitating the development of deep\nlearning models to recognize these, similar to the advancements made in human\naction recognition. The proposed dataset comprises 178 videos recorded in\nSpanish wetlands, capturing 13 different bird species performing 7 distinct\nbehavior classes. In addition, we also present baseline results using state of\nthe art models on two tasks: bird behavior recognition and species\nclassification.\n","authors":["Javier Rodriguez-Juan","David Ortiz-Perez","Manuel Benavent-Lledo","David Mulero-Pérez","Pablo Ruiz-Ponce","Adrian Orihuela-Torres","Jose Garcia-Rodriguez","Esther Sebastián-González"],"pdf_url":"https://arxiv.org/pdf/2501.08931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08924v1","updated":"2025-01-15T16:30:05Z","published":"2025-01-15T16:30:05Z","title":"Learning Joint Denoising, Demosaicing, and Compression from the Raw\n Natural Image Noise Dataset","summary":" This paper introduces the Raw Natural Image Noise Dataset (RawNIND), a\ndiverse collection of paired raw images designed to support the development of\ndenoising models that generalize across sensors, image development workflows,\nand styles. Two denoising methods are proposed: one operates directly on raw\nBayer data, leveraging computational efficiency, while the other processes\nlinear RGB images for improved generalization to different sensors, with both\npreserving flexibility for subsequent development. Both methods outperform\ntraditional approaches which rely on developed images. Additionally, the\nintegration of denoising and compression at the raw data level significantly\nenhances rate-distortion performance and computational efficiency. These\nfindings suggest a paradigm shift toward raw data workflows for efficient and\nflexible image processing.\n","authors":["Benoit Brummer","Christophe De Vleeschouwer"],"pdf_url":"https://arxiv.org/pdf/2501.08924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08912v1","updated":"2025-01-15T16:20:26Z","published":"2025-01-15T16:20:26Z","title":"Empowering Agricultural Insights: RiceLeafBD - A Novel Dataset and\n Optimal Model Selection for Rice Leaf Disease Diagnosis through Transfer\n Learning Technique","summary":" The number of people living in this agricultural nation of ours, which is\nsurrounded by lush greenery, is growing on a daily basis. As a result of this,\nthe level of arable land is decreasing, as well as residential houses and\nindustrial factories. The food crisis is becoming the main threat for us in the\nupcoming days. Because on the one hand, the population is increasing, and on\nthe other hand, the amount of food crop production is decreasing due to the\nattack of diseases. Rice is one of the most significant cultivated crops since\nit provides food for more than half of the world's population. Bangladesh is\ndependent on rice (Oryza sativa) as a vital crop for its agriculture, but it\nfaces a significant problem as a result of the ongoing decline in rice yield\nbrought on by common diseases. Early disease detection is the main difficulty\nin rice crop cultivation. 
In this paper, we proposed our own dataset, which was\ncollected from the Bangladesh field, and also applied deep learning and\ntransfer learning models for the evaluation of the datasets. We elaborately\nexplain our dataset and also give direction for further research work to serve\nsociety using this dataset. We applied a light CNN model and pre-trained\nInceptionNet-V2, EfficientNet-V2, and MobileNet-V2 models, of which the\nEfficientNet-V2 model achieved 91.5% performance in this work. The results obtained\noutperformed other models and even exceeded approaches that are considered to be\npart of the state of the art. It has been demonstrated by this study that it is\npossible to precisely and effectively identify diseases that affect rice leaves\nusing this unbiased dataset. After analysis of the performance of different\nmodels, the proposed dataset is significant for society and for research work\nto provide solutions for decreasing rice leaf disease.\n","authors":["Sadia Afrin Rimi","Md. Jalal Uddin Chowdhury","Rifat Abdullah","Iftekhar Ahmed","Mahrima Akter Mim","Mohammad Shoaib Rahman"],"pdf_url":"https://arxiv.org/pdf/2501.08912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08910v1","updated":"2025-01-15T16:19:37Z","published":"2025-01-15T16:19:37Z","title":"Lights, Camera, Matching: The Role of Image Illumination in Fair Face\n Recognition","summary":" Facial brightness is a key image quality factor impacting face recognition\naccuracy differentials across demographic groups. In this work, we aim to\ndecrease the accuracy gap between the similarity score distributions for\nCaucasian and African American female mated image pairs, as measured by d'\nbetween distributions. To balance brightness across demographic groups, we\nconduct three experiments, interpreting brightness in the face skin region\neither as median pixel value or as the distribution of pixel values. Balancing\nbased on median brightness alone yields up to a 46.8% decrease in d', while\nbalancing based on brightness distribution yields up to a 57.6% decrease. In\nall three cases, the similarity scores of the individual distributions improve,\nwith mean scores maximally improving 5.9% for Caucasian females and 3.7% for\nAfrican American females.\n","authors":["Gabriella Pangelinan","Grace Bezold","Haiyu Wu","Michael C. King","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2501.08910v1.pdf","comment":"14 pages, 11 figures, Conference submission"},{"id":"http://arxiv.org/abs/2308.07898v2","updated":"2025-01-15T16:19:36Z","published":"2023-08-15T17:39:52Z","title":"A Foundation Language-Image Model of the Retina (FLAIR): Encoding Expert\n Knowledge in Text Supervision","summary":" Foundation vision-language models are currently transforming computer vision,\nand are on the rise in medical imaging fueled by their very promising\ngeneralization capabilities. However, the initial attempts to transfer this new\nparadigm to medical imaging have shown less impressive performances than those\nobserved in other domains, due to the significant domain shift and the complex,\nexpert domain knowledge inherent to medical-imaging tasks. Motivated by the\nneed for domain-expert foundation models, we present FLAIR, a pre-trained\nvision-language model for universal retinal fundus image understanding. To this\nend, we compiled 38 open-access, mostly categorical fundus imaging datasets\nfrom various sources, with up to 101 different target conditions and 288,307\nimages. 
We integrate the expert's domain knowledge in the form of descriptive\ntextual prompts, during both pre-training and zero-shot inference, enhancing\nthe less-informative categorical supervision of the data. Such a textual\nexpert's knowledge, which we compiled from the relevant clinical literature and\ncommunity standards, describes the fine-grained features of the pathologies as\nwell as the hierarchies and dependencies between them. We report comprehensive\nevaluations, which illustrate the benefit of integrating expert knowledge and\nthe strong generalization capabilities of FLAIR under difficult scenarios with\ndomain shifts or unseen categories. When adapted with a lightweight linear\nprobe, FLAIR outperforms fully-trained, dataset-focused models, more so in the\nfew-shot regimes. Interestingly, FLAIR outperforms by a wide margin\nlarger-scale generalist image-language models and retina domain-specific\nself-supervised networks, which emphasizes the potential of embedding experts'\ndomain knowledge and the limitations of generalist models in medical imaging.\n","authors":["Julio Silva-Rodríguez","Hadi Chakor","Riadh Kobbi","Jose Dolz","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2308.07898v2.pdf","comment":"Accepted in Medical Image Analysis. The pre-trained model is\n available at: https://github.com/jusiro/FLAIR"},{"id":"http://arxiv.org/abs/2501.08902v1","updated":"2025-01-15T16:11:24Z","published":"2025-01-15T16:11:24Z","title":"Multi-View Transformers for Airway-To-Lung Ratio Inference on Cardiac CT\n Scans: The C4R Study","summary":" The ratio of airway tree lumen to lung size (ALR), assessed at full\ninspiration on high resolution full-lung computed tomography (CT), is a major\nrisk factor for chronic obstructive pulmonary disease (COPD). There is growing\ninterest to infer ALR from cardiac CT images, which are widely available in\nepidemiological cohorts, to investigate the relationship of ALR to severe\nCOVID-19 and post-acute sequelae of SARS-CoV-2 infection (PASC). Previously,\ncardiac scans included approximately 2/3 of the total lung volume with 5-6x\ngreater slice thickness than high-resolution (HR) full-lung (FL) CT. In this\nstudy, we present a novel attention-based Multi-view Swin Transformer to infer\nFL ALR values from segmented cardiac CT scans. For the supervised training we\nexploit paired full-lung and cardiac CTs acquired in the Multi-Ethnic Study of\nAtherosclerosis (MESA). Our network significantly outperforms a proxy direct\nALR inference on segmented cardiac CT scans and achieves accuracy and\nreproducibility comparable with a scan-rescan reproducibility of the FL ALR\nground-truth.\n","authors":["Sneha N. Naik","Elsa D. Angelini","Eric A. Hoffman","Elizabeth C. Oelsner","R. Graham Barr","Benjamin M. Smith","Andrew F. Laine"],"pdf_url":"https://arxiv.org/pdf/2501.08902v1.pdf","comment":"Accepted to appear in Proceedings of International Symposium on\n Biomedical Imaging (ISBI), 2025"},{"id":"http://arxiv.org/abs/2501.08900v1","updated":"2025-01-15T16:08:25Z","published":"2025-01-15T16:08:25Z","title":"Enhanced Multi-Scale Cross-Attention for Person Image Generation","summary":" In this paper, we propose a novel cross-attention-based generative\nadversarial network (GAN) for the challenging person image generation task.\nCross-attention is a novel and intuitive multi-modal fusion method in which an\nattention/correlation matrix is calculated between two feature maps of\ndifferent modalities. 
Specifically, we propose the novel XingGAN (or\nCrossingGAN), which consists of two generation branches that capture the\nperson's appearance and shape, respectively. Moreover, we propose two novel\ncross-attention blocks to effectively transfer and update the person's shape\nand appearance embeddings for mutual improvement. This has not been considered\nby any other existing GAN-based image generation work. To further learn the\nlong-range correlations between different person poses at different scales and\nsub-regions, we propose two novel multi-scale cross-attention blocks. To tackle\nthe issue of independent correlation computations within the cross-attention\nmechanism leading to noisy and ambiguous attention weights, which hinder\nperformance improvements, we propose a module called enhanced attention (EA).\nLastly, we introduce a novel densely connected co-attention module to fuse\nappearance and shape features at different stages effectively. Extensive\nexperiments on two public datasets demonstrate that the proposed method\noutperforms current GAN-based methods and performs on par with diffusion-based\nmethods. However, our method is significantly faster than diffusion-based\nmethods in both training and inference.\n","authors":["Hao Tang","Ling Shao","Nicu Sebe","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2501.08900v1.pdf","comment":"Accepted to TPAMI, an extended version of a paper published in\n ECCV2020. arXiv admin note: substantial text overlap with arXiv:2007.09278"},{"id":"http://arxiv.org/abs/2501.08885v1","updated":"2025-01-15T15:56:06Z","published":"2025-01-15T15:56:06Z","title":"Feature-based One-For-All: A Universal Framework for Heterogeneous\n Knowledge Distillation","summary":" Knowledge distillation (KD) involves transferring knowledge from a\npre-trained heavy teacher model to a lighter student model, thereby reducing\nthe inference cost while maintaining comparable effectiveness. Prior KD\ntechniques typically assume homogeneity between the teacher and student models.\nHowever, as technology advances, a wide variety of architectures have emerged,\nranging from initial Convolutional Neural Networks (CNNs) to Vision\nTransformers (ViTs), and Multi-Level Perceptrons (MLPs). Consequently,\ndeveloping a universal KD framework compatible with any architecture has become\nan important research topic. In this paper, we introduce a feature-based\none-for-all (FOFA) KD framework to enable feature distillation across diverse\narchitecture. Our framework comprises two key components. First, we design\nprompt tuning blocks that incorporate student feedback, allowing teacher\nfeatures to adapt to the student model's learning process. Second, we propose\nregion-aware attention to mitigate the view mismatch problem between\nheterogeneous architecture. By leveraging these two modules, effective\ndistillation of intermediate features can be achieved across heterogeneous\narchitectures. 
Extensive experiments on CIFAR, ImageNet, and COCO demonstrate\nthe superiority of the proposed method.\n","authors":["Jhe-Hao Lin","Yi Yao","Chan-Feng Hsu","Hongxia Xie","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2501.08885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.20062v2","updated":"2025-01-15T15:53:13Z","published":"2024-12-28T07:34:49Z","title":"MADiff: Text-Guided Fashion Image Editing with Mask Prediction and\n Attention-Enhanced Diffusion","summary":" Text-guided image editing models have achieved great success in the general domain.\nHowever, directly applying these models to the fashion domain may encounter two\nissues: (1) Inaccurate localization of editing region; (2) Weak editing\nmagnitude. To address these issues, the MADiff model is proposed. Specifically,\nto more accurately identify editing region, the MaskNet is proposed, in which\nthe foreground region, densepose and mask prompts from large language model are\nfed into a lightweight UNet to predict the mask for editing region. To\nstrengthen the editing magnitude, the Attention-Enhanced Diffusion Model is\nproposed, where the noise map, attention map, and the mask from MaskNet are fed\ninto the proposed Attention Processor to produce a refined noise map. By\nintegrating the refined noise map into the diffusion model, the edited image\ncan better align with the target prompt. Given the absence of benchmarks in\nfashion image editing, we constructed a dataset named Fashion-E, comprising\n28390 image-text pairs in the training set, and 2639 image-text pairs for four\ntypes of fashion tasks in the evaluation set. Extensive experiments on\nFashion-E demonstrate that our proposed method can accurately predict the mask\nof editing region and significantly enhance editing magnitude in fashion image\nediting compared to the state-of-the-art methods.\n","authors":["Zechao Zhan","Dehong Gao","Jinxia Zhang","Jiale Huang","Yang Hu","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.20062v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03492v6","updated":"2025-01-15T15:26:03Z","published":"2023-06-06T08:19:30Z","title":"Industrial Anomaly Detection and Localization Using Weakly-Supervised\n Residual Transformers","summary":" Recent advancements in industrial anomaly detection (AD) have demonstrated\nthat incorporating a small number of anomalous samples during training can\nsignificantly enhance accuracy. However, this improvement often comes at the\ncost of extensive annotation efforts, which are impractical for many real-world\napplications. In this paper, we introduce a novel framework, Weakly-supervised\nRESidual Transformer (WeakREST), designed to achieve high anomaly detection\naccuracy while minimizing the reliance on manual annotations. First, we\nreformulate the pixel-wise anomaly localization task into a block-wise\nclassification problem. Second, we introduce a residual-based feature\nrepresentation called Positional Fast Anomaly Residuals (PosFAR) which captures\nanomalous patterns more effectively. To leverage this feature, we adapt the\nSwin Transformer for enhanced anomaly detection and localization. Additionally,\nwe propose a weak annotation approach, utilizing bounding boxes and image tags\nto define anomalous regions. This approach establishes a semi-supervised\nlearning context that reduces the dependency on precise pixel-level labels. 
To\nfurther improve the learning process, we develop a novel ResMixMatch algorithm,\ncapable of handling the interplay between weak labels and residual-based\nrepresentations.\n On the benchmark dataset MVTec-AD, our method achieves an Average Precision\n(AP) of $83.0\\%$, surpassing the previous best result of $82.7\\%$ in the\nunsupervised setting. In the supervised AD setting, WeakREST attains an AP of\n$87.6\\%$, outperforming the previous best of $86.0\\%$. Notably, even when using\nweaker annotations such as bounding boxes, WeakREST exceeds the performance of\nleading methods relying on pixel-wise supervision, achieving an AP of $87.1\\%$\ncompared to the prior best of $86.0\\%$ on MVTec-AD.\n","authors":["Hanxi Li","Jingqi Wu","Deyin Liu","Lin Wu","Hao Chen","Mingwen Wang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2306.03492v6.pdf","comment":"13 pages,7 figures"},{"id":"http://arxiv.org/abs/2411.10175v2","updated":"2025-01-15T15:24:32Z","published":"2024-11-15T13:21:26Z","title":"The Surprising Ineffectiveness of Pre-Trained Visual Representations for\n Model-Based Reinforcement Learning","summary":" Visual Reinforcement Learning (RL) methods often require extensive amounts of\ndata. As opposed to model-free RL, model-based RL (MBRL) offers a potential\nsolution with efficient data utilization through planning. Additionally, RL\nlacks generalization capabilities for real-world tasks. Prior work has shown\nthat incorporating pre-trained visual representations (PVRs) enhances sample\nefficiency and generalization. While PVRs have been extensively studied in the\ncontext of model-free RL, their potential in MBRL remains largely unexplored.\nIn this paper, we benchmark a set of PVRs on challenging control tasks in a\nmodel-based RL setting. We investigate the data efficiency, generalization\ncapabilities, and the impact of different properties of PVRs on the performance\nof model-based agents. Our results, perhaps surprisingly, reveal that for MBRL\ncurrent PVRs are not more sample efficient than learning representations from\nscratch, and that they do not generalize better to out-of-distribution (OOD)\nsettings. To explain this, we analyze the quality of the trained dynamics\nmodel. Furthermore, we show that data diversity and network architecture are\nthe most important contributors to OOD generalization performance.\n","authors":["Moritz Schneider","Robert Krug","Narunas Vaskevicius","Luigi Palmieri","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2411.10175v2.pdf","comment":"Published at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024). Project page: https://schneimo.com/pvr4mbrl/"},{"id":"http://arxiv.org/abs/2412.18977v2","updated":"2025-01-15T15:22:45Z","published":"2024-12-25T19:38:32Z","title":"CGCOD: Class-Guided Camouflaged Object Detection","summary":" Camouflaged Object Detection (COD) aims to identify objects that blend\nseamlessly into their surroundings. The inherent visual complexity of\ncamouflaged objects, including their low contrast with the background, diverse\ntextures, and subtle appearance variations, often obscures semantic cues,\nmaking accurate segmentation highly challenging. Existing methods primarily\nrely on visual features, which are insufficient to handle the variability and\nintricacy of camouflaged objects, leading to unstable object perception and\nambiguous segmentation results. 
To tackle these limitations, we introduce a\nnovel task, class-guided camouflaged object detection (CGCOD), which extends\ntraditional COD task by incorporating object-specific class knowledge to\nenhance detection robustness and accuracy. To facilitate this task, we present\na new dataset, CamoClass, comprising real-world camouflaged objects with class\nannotations. Furthermore, we propose a multi-stage framework, CGNet, which\nincorporates a plug-and-play class prompt generator and a simple yet effective\nclass-guided detector. This establishes a new paradigm for COD, bridging the\ngap between contextual understanding and class-guided detection. Extensive\nexperimental results demonstrate the effectiveness of our flexible framework in\nimproving the performance of proposed and existing detectors by leveraging\nclass-level textual information.\n","authors":["Chenxi Zhang","Qing Zhang","Jiayun Wu","Youwei Pang"],"pdf_url":"https://arxiv.org/pdf/2412.18977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08861v1","updated":"2025-01-15T15:20:46Z","published":"2025-01-15T15:20:46Z","title":"Generative Planning with 3D-vision Language Pre-training for End-to-End\n Autonomous Driving","summary":" Autonomous driving is a challenging task that requires perceiving and\nunderstanding the surrounding environment for safe trajectory planning. While\nexisting vision-based end-to-end models have achieved promising results, these\nmethods are still facing the challenges of vision understanding, decision\nreasoning and scene generalization. To solve these issues, a generative\nplanning with 3D-vision language pre-training model named GPVL is proposed for\nend-to-end autonomous driving. The proposed paradigm has two significant\naspects. On one hand, a 3D-vision language pre-training module is designed to\nbridge the gap between visual perception and linguistic understanding in the\nbird's eye view. On the other hand, a cross-modal language model is introduced\nto generate holistic driving decisions and fine-grained trajectories with\nperception and navigation information in an auto-regressive manner. Experiments\non the challenging nuScenes dataset demonstrate that the proposed scheme\nachieves excellent performances compared with state-of-the-art methods.\nBesides, the proposed GPVL presents strong generalization ability and real-time\npotential when handling high-level commands in various scenarios. It is\nbelieved that the effective, robust and efficient performance of GPVL is\ncrucial for the practical application of future autonomous driving systems.\nCode is available at https://github.com/ltp1995/GPVL\n","authors":["Tengpeng Li","Hanli Wang","Xianfei Li","Wenlong Liao","Tao He","Pai Peng"],"pdf_url":"https://arxiv.org/pdf/2501.08861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08841v1","updated":"2025-01-15T14:52:20Z","published":"2025-01-15T14:52:20Z","title":"Exploring Task-Level Optimal Prompts for Visual In-Context Learning","summary":" With the development of Vision Foundation Models (VFMs) in recent years,\nVisual In-Context Learning (VICL) has become a better choice compared to\nmodifying models in most scenarios. Different from retraining or fine-tuning\nmodel, VICL does not require modifications to the model's weights or\narchitecture, and only needs a prompt with demonstrations to teach VFM how to\nsolve tasks. 
Currently, significant computational cost for finding optimal\nprompts for every test sample hinders the deployment of VICL, as determining\nwhich demonstrations to use for constructing prompts is very costly. In this\npaper, however, we find a counterintuitive phenomenon that most test samples\nactually achieve optimal performance under the same prompts, and searching for\nsample-level prompts only costs more time but results in completely identical\nprompts. Therefore, we propose task-level prompting to reduce the cost of\nsearching for prompts during the inference stage and introduce two time-saving\nyet effective task-level prompt search strategies. Extensive experimental\nresults show that our proposed method can identify near-optimal prompts and\nreach the best VICL performance with a minimal cost that prior work has never\nachieved.\n","authors":["Yan Zhu","Huan Ma","Changqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.08841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08837v1","updated":"2025-01-15T14:46:44Z","published":"2025-01-15T14:46:44Z","title":"MANTA: Diffusion Mamba for Efficient and Effective Stochastic Long-Term\n Dense Anticipation","summary":" Our work addresses the problem of stochastic long-term dense anticipation.\nThe goal of this task is to predict actions and their durations several minutes\ninto the future based on provided video observations. Anticipation over\nextended horizons introduces high uncertainty, as a single observation can lead\nto multiple plausible future outcomes. To address this uncertainty, stochastic\nmodels are designed to predict several potential future action sequences.\nRecent work has further proposed to incorporate uncertainty modelling for\nobserved frames by simultaneously predicting per-frame past and future actions\nin a unified manner. While such joint modelling of actions is beneficial, it\nrequires long-range temporal capabilities to connect events across distant past\nand future time points. However, the previous work struggles to achieve such a\nlong-range understanding due to its limited and/or sparse receptive field. To\nalleviate this issue, we propose a novel MANTA (MAmba for ANTicipation)\nnetwork. Our model enables effective long-term temporal modelling even for very\nlong sequences while maintaining linear complexity in sequence length. We\ndemonstrate that our approach achieves state-of-the-art results on three\ndatasets - Breakfast, 50Salads, and Assembly101 - while also significantly\nimproving computational and memory efficiency.\n","authors":["Olga Zatsarynna","Emad Bahrami","Yazan Abu Farha","Gianpiero Francesca","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2501.08837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.16758v2","updated":"2025-01-15T14:35:11Z","published":"2024-12-21T20:16:37Z","title":"Evaluation of radiomic feature harmonization techniques for benign and\n malignant pulmonary nodules","summary":" BACKGROUND: Radiomics provides quantitative features of pulmonary nodules\n(PNs) which could aid lung cancer diagnosis, but medical image acquisition\nvariability is an obstacle to clinical application. Acquisition effects may\ndiffer between radiomic features from benign vs. malignant PNs. PURPOSE: We\nevaluated how to account for differences between benign and malignant PNs when\ncorrecting radiomic features' acquisition dependency. METHODS: We used 567\nchest CT scans grouped as benign, malignant, or lung cancer screening (mixed\nbenign, malignant). 
ComBat harmonization was applied to extracted features for\nvariation in 4 acquisition parameters. We compared: harmonizing without\ndistinction, harmonizing with a covariate to preserve distinctions between\nsubgroups, and harmonizing subgroups separately. Significant ($p\\le0.05$)\nKruskal-Wallis tests showed whether harmonization removed acquisition\ndependency. A LASSO-SVM pipeline was trained on successfully harmonized\nfeatures to predict malignancy. To evaluate predictive information in these\nfeatures, the trained harmonization estimators and predictive model were\napplied to unseen test sets. Harmonization and predictive performance were\nassessed for 10 trials of 5-fold cross-validation. RESULTS: An average 2.1% of\nfeatures (95% CI:1.9-2.4%) were acquisition-independent when harmonized without\ndistinction, 27.3% (95% CI:25.7-28.9%) when harmonized with a covariate, and\n90.9% (95% CI:90.4-91.5%) when harmonized separately. Data harmonized\nseparately or with a covariate trained models with higher ROC-AUC for screening\nscans than data harmonized without distinction between benign and malignant PNs\n(Delong test, adjusted $p\\le0.05$). CONCLUSIONS: Radiomic features of benign\nand malignant PNs need different corrective transformations to recover\nacquisition-independent distributions. This can be done by harmonizing\nseparately or with a covariate.\n","authors":["Claire Huchthausen","Menglin Shi","Gabriel L. A. de Sousa","Jonathan Colen","Emery Shelley","James Larner","Einsley Janowski","Krishni Wijesooriya"],"pdf_url":"https://arxiv.org/pdf/2412.16758v2.pdf","comment":"15 pages, 3 figures, plus supplemental material; updated author list,\n corrected result in paragraph 3 of Discussion, updated Figure S1"},{"id":"http://arxiv.org/abs/2501.08828v1","updated":"2025-01-15T14:30:13Z","published":"2025-01-15T14:30:13Z","title":"MMDocIR: Benchmarking Multi-Modal Retrieval for Long Documents","summary":" Multi-modal document retrieval is designed to identify and retrieve various\nforms of multi-modal content, such as figures, tables, charts, and layout\ninformation from extensive documents. Despite its significance, there is a\nnotable lack of a robust benchmark to effectively evaluate the performance of\nsystems in multi-modal document retrieval. To address this gap, this work\nintroduces a new benchmark, named as MMDocIR, encompassing two distinct tasks:\npage-level and layout-level retrieval. The former focuses on localizing the\nmost relevant pages within a long document, while the latter targets the\ndetection of specific layouts, offering a more fine-grained granularity than\nwhole-page analysis. A layout can refer to a variety of elements such as\ntextual paragraphs, equations, figures, tables, or charts. The MMDocIR\nbenchmark comprises a rich dataset featuring expertly annotated labels for\n1,685 questions and bootstrapped labels for 173,843 questions, making it a\npivotal resource for advancing multi-modal document retrieval for both training\nand evaluation. Through rigorous experiments, we reveal that (i) visual\nretrievers significantly outperform their text counterparts, (ii) MMDocIR train\nset can effectively benefit the training process of multi-modal document\nretrieval and (iii) text retrievers leveraging on VLM-text perform much better\nthan those using OCR-text. 
These findings underscore the potential advantages\nof integrating visual elements for multi-modal document retrieval.\n","authors":["Kuicai Dong","Yujing Chang","Xin Deik Goh","Dexun Li","Ruiming Tang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2501.08828v1.pdf","comment":"https://huggingface.co/MMDocIR"},{"id":"http://arxiv.org/abs/2501.08819v1","updated":"2025-01-15T14:17:13Z","published":"2025-01-15T14:17:13Z","title":"Boosting Diffusion Guidance via Learning Degradation-Aware Models for\n Blind Super Resolution","summary":" Recently, diffusion-based blind super-resolution (SR) methods have shown\ngreat ability to generate high-resolution images with abundant high-frequency\ndetail, but the detail is often achieved at the expense of fidelity. Meanwhile,\nanother line of research focusing on rectifying the reverse process of\ndiffusion models (i.e., diffusion guidance), has demonstrated the power to\ngenerate high-fidelity results for non-blind SR. However, these methods rely on\nknown degradation kernels, making them difficult to apply to blind SR. To\naddress these issues, we introduce degradation-aware models that can be\nintegrated into the diffusion guidance framework, eliminating the need to know\ndegradation kernels. Additionally, we propose two novel techniques, input\nperturbation and guidance scalar, to further improve our performance. Extensive\nexperimental results show that our proposed method has superior performance\nover state-of-the-art methods on blind SR benchmarks.\n","authors":["Shao-Hao Lu","Ren Wang","Ching-Chun Huang","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2501.08819v1.pdf","comment":"To appear in WACV 2025. Code is available at:\n https://github.com/ryanlu2240/Boosting-Diffusion-Guidance-via-Learning-Degradation-Aware-Models-for-Blind-Super-Resolution"},{"id":"http://arxiv.org/abs/2501.08816v1","updated":"2025-01-15T14:12:59Z","published":"2025-01-15T14:12:59Z","title":"IDEA: Image Description Enhanced CLIP-Adapter","summary":" CLIP (Contrastive Language-Image Pre-training) has attained great success in\npattern recognition and computer vision. Transferring CLIP to downstream tasks\n(e.g. zero- or few-shot classification) is a hot topic in multimodal learning.\nHowever, current studies primarily focus on either prompt learning for text or\nadapter tuning for vision, without fully exploiting the complementary\ninformation and correlations among image-text pairs. In this paper, we propose\nan Image Description Enhanced CLIP-Adapter (IDEA) method to adapt CLIP to\nfew-shot image classification tasks. This method captures fine-grained features\nby leveraging both visual features and textual descriptions of images. IDEA is\na training-free method for CLIP, and it can be comparable to or even exceed\nstate-of-the-art models on multiple tasks. Furthermore, we introduce\nTrainable-IDEA (T-IDEA), which extends IDEA by adding two lightweight learnable\ncomponents (i.e., a projector and a learnable latent space), further enhancing\nthe model's performance and achieving SOTA results on 11 datasets. As one\nimportant contribution, we employ the Llama model and design a comprehensive\npipeline to generate textual descriptions for images of 11 datasets, resulting\nin a total of 1,637,795 image-text pairs, named \"IMD-11\". 
Our code and data are\nreleased at https://github.com/FourierAI/IDEA.\n","authors":["Zhipeng Ye","Feng Jiang","Qiufeng Wang","Kaizhu Huang","Jiaqi Huang"],"pdf_url":"https://arxiv.org/pdf/2501.08816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08815v1","updated":"2025-01-15T14:12:55Z","published":"2025-01-15T14:12:55Z","title":"Human Pose-Constrained UV Map Estimation","summary":" UV map estimation is used in computer vision for detailed analysis of human\nposture or activity. Previous methods assign pixels to body model vertices by\ncomparing pixel descriptors independently, without enforcing global coherence\nor plausibility in the UV map. We propose Pose-Constrained Continuous Surface\nEmbeddings (PC-CSE), which integrates estimated 2D human pose into the\npixel-to-vertex assignment process. The pose provides global anatomical\nconstraints, ensuring that UV maps remain coherent while preserving local\nprecision. Evaluation on DensePose COCO demonstrates consistent improvement,\nregardless of the chosen 2D human pose model. Whole-body poses offer better\nconstraints by incorporating additional details about the hands and feet.\nConditioning UV maps with human pose reduces invalid mappings and enhances\nanatomical plausibility. In addition, we highlight inconsistencies in the\nground-truth annotations.\n","authors":["Matej Suchanek","Miroslav Purkrabek","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2501.08815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08807v1","updated":"2025-01-15T14:03:27Z","published":"2025-01-15T14:03:27Z","title":"Multi-visual modality micro drone-based structural damage detection","summary":" Accurate detection and resilience of object detectors in structural damage\ndetection are important in ensuring the continuous use of civil infrastructure.\nHowever, achieving robustness in object detectors remains a persistent\nchallenge, impacting their ability to generalize effectively. This study\nproposes DetectorX, a robust framework for structural damage detection coupled\nwith a micro drone. DetectorX addresses the challenges of object detector\nrobustness by incorporating two innovative modules: a stem block and a spiral\npooling technique. The stem block introduces a dynamic visual modality by\nleveraging the outputs of two Deep Convolutional Neural Network (DCNN) models.\nThe framework employs the proposed event-based reward reinforcement learning to\nconstrain the actions of a parent and child DCNN model leading to a reward.\nThis results in the induction of two dynamic visual modalities alongside the\nRed, Green, and Blue (RGB) data. This enhancement significantly augments\nDetectorX's perception and adaptability in diverse environmental situations.\nFurther, a spiral pooling technique, an online image augmentation method,\nstrengthens the framework by increasing feature representations by\nconcatenating spiraled and average/max pooled features. In three extensive\nexperiments: (1) comparative and (2) robustness, which use the Pacific\nEarthquake Engineering Research Hub ImageNet dataset, and (3) field-experiment,\nDetectorX performed satisfactorily across varying metrics, including precision\n(0.88), recall (0.84), average precision (0.91), mean average precision (0.76),\nand mean average recall (0.73), compared to the competing detectors including\nYou Only Look Once X-medium (YOLOX-m) and others. 
The study's findings indicate\nthat DetectorX can provide satisfactory results and demonstrate resilience in\nchallenging environments.\n","authors":["Isaac Osei Agyemanga","Liaoyuan Zeng","Jianwen Chena","Isaac Adjei-Mensah","Daniel Acheampong"],"pdf_url":"https://arxiv.org/pdf/2501.08807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19694v2","updated":"2025-01-15T13:53:29Z","published":"2024-07-29T04:33:04Z","title":"Structural damage detection via hierarchical damage information with\n volumetric assessment","summary":" Structural health monitoring (SHM) is essential for ensuring the safety and\nlongevity of infrastructure, but complex image environments, noisy labels, and\nreliance on manual damage assessments often hinder its effectiveness. This\nstudy introduces the Guided Detection Network (Guided-DetNet), a framework\ndesigned to address these challenges. Guided-DetNet is characterized by a\nGenerative Attention Module (GAM), Hierarchical Elimination Algorithm (HEA),\nand Volumetric Contour Visual Assessment (VCVA). GAM leverages cross-horizontal\nand cross-vertical patch merging and cross-foreground-background feature fusion\nto generate varied features to mitigate complex image environments. HEA\naddresses noisy labeling using hierarchical relationships among classes to\nrefine instances given an image by eliminating unlikely class instances. VCVA\nassesses the severity of detected damages via volumetric representation and\nquantification leveraging the Dirac delta distribution. A comprehensive\nquantitative study and two robustness tests were conducted using the PEER Hub\ndataset, and a drone-based application, which involved a field experiment, was\nconducted to substantiate Guided-DetNet's promising performances. In triple\nclassification tasks, the framework achieved 96% accuracy, surpassing\nstate-of-the-art classifiers by up to 3%. In dual detection tasks, it\noutperformed competitive detectors with a precision of 94% and a mean average\nprecision (mAP) of 79% while maintaining a frame rate of 57.04fps, suitable for\nreal-time applications. Additionally, robustness tests demonstrated resilience\nunder adverse conditions, with precision scores ranging from 79% to 91%.\nGuided-DetNet is established as a robust and efficient framework for SHM,\noffering advancements in automation and precision, with the potential for\nwidespread application in drone-based infrastructure inspections.\n","authors":["Isaac Osei Agyemang","Isaac Adjei-Mensah","Daniel Acheampong","Gordon Owusu Boateng","Adu Asare Baffour"],"pdf_url":"https://arxiv.org/pdf/2407.19694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08799v1","updated":"2025-01-15T13:46:33Z","published":"2025-01-15T13:46:33Z","title":"Exploring ChatGPT for Face Presentation Attack Detection in Zero and\n Few-Shot in-Context Learning","summary":" This study highlights the potential of ChatGPT (specifically GPT-4o) as a\ncompetitive alternative for Face Presentation Attack Detection (PAD),\noutperforming several PAD models, including commercial solutions, in specific\nscenarios. Our results show that GPT-4o demonstrates high consistency,\nparticularly in few-shot in-context learning, where its performance improves as\nmore examples are provided (reference data). We also observe that detailed\nprompts enable the model to provide scores reliably, a behavior not observed\nwith concise prompts. Additionally, explanation-seeking prompts slightly\nenhance the model's performance by improving its interpretability. 
Remarkably,\nthe model exhibits emergent reasoning capabilities, correctly predicting the\nattack type (print or replay) with high accuracy in few-shot scenarios, despite\nnot being explicitly instructed to classify attack types. Despite these\nstrengths, GPT-4o faces challenges in zero-shot tasks, where its performance is\nlimited compared to specialized PAD systems. Experiments were conducted on a\nsubset of the SOTERIA dataset, ensuring compliance with data privacy\nregulations by using only data from consenting individuals. These findings\nunderscore GPT-4o's promise in PAD applications, laying the groundwork for\nfuture research to address broader data privacy concerns and improve\ncross-dataset generalization. Code available here:\nhttps://gitlab.idiap.ch/bob/bob.paper.wacv2025_chatgpt_face_pad\n","authors":["Alain Komaty","Hatef Otroshi Shahreza","Anjith George","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2501.08799v1.pdf","comment":"Accepted in WACV workshop 2025"},{"id":"http://arxiv.org/abs/2412.16563v2","updated":"2025-01-15T13:34:12Z","published":"2024-12-21T10:16:07Z","title":"SemTalk: Holistic Co-speech Motion Generation with Frame-level Semantic\n Emphasis","summary":" A good co-speech motion generation cannot be achieved without a careful\nintegration of common rhythmic motion and rare yet essential semantic motion.\nIn this work, we propose SemTalk for holistic co-speech motion generation with\nframe-level semantic emphasis. Our key insight is to separately learn general\nmotions and sparse motions, and then adaptively fuse them. In particular,\nrhythmic consistency learning is explored to establish rhythm-related base\nmotion, ensuring a coherent foundation that synchronizes gestures with the\nspeech rhythm. Subsequently, semantic emphasis learning is designed to\ngenerate semantic-aware sparse motion, focusing on frame-level semantic cues.\nFinally, to integrate sparse motion into the base motion and generate\nsemantic-emphasized co-speech gestures, we further leverage a learned semantic\nscore for adaptive synthesis. Qualitative and quantitative comparisons on two\npublic datasets demonstrate that our method outperforms the state-of-the-art,\ndelivering high-quality co-speech motion with enhanced semantic richness over a\nstable base motion.\n","authors":["Xiangyue Zhang","Jianfang Li","Jiaxu Zhang","Ziqiang Dang","Jianqiang Ren","Liefeng Bo","Zhigang Tu"],"pdf_url":"https://arxiv.org/pdf/2412.16563v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2501.02487v3","updated":"2025-01-15T13:07:56Z","published":"2025-01-05T09:40:58Z","title":"ACE++: Instruction-Based Image Creation and Editing via Context-Aware\n Content Filling","summary":" We report ACE++, an instruction-based diffusion framework that tackles\nvarious image generation and editing tasks. Inspired by the input format for\nthe inpainting task proposed by FLUX.1-Fill-dev, we improve the Long-context\nCondition Unit (LCU) introduced in ACE and extend this input paradigm to any\nediting and generation tasks. To take full advantage of image generative\npriors, we develop a two-stage training scheme to minimize the efforts of\nfinetuning powerful text-to-image diffusion models like FLUX.1-dev. In the\nfirst stage, we pre-train the model using task data with the 0-ref tasks from\nthe text-to-image model. There are many models in the community based on the\npost-training of text-to-image foundational models that meet this training\nparadigm of the first stage. 
For example, FLUX.1-Fill-dev deals primarily with\ninpainting tasks and can be used as an initialization to accelerate the training\nprocess. In the second stage, we finetune the above model to support the\ngeneral instructions using all tasks defined in ACE. To promote the widespread\napplication of ACE++ in different scenarios, we provide a comprehensive set of\nmodels that cover both full finetuning and lightweight finetuning, while\nconsidering general applicability and applicability in vertical scenarios. The\nqualitative analysis showcases the superiority of ACE++ in terms of generating\nimage quality and prompt following ability. Code and models will be available\non the project page: https://ali-vilab.github.io/ACE_plus_page/.\n","authors":["Chaojie Mao","Jingfeng Zhang","Yulin Pan","Zeyinzi Jiang","Zhen Han","Yu Liu","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.02487v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07047v2","updated":"2025-01-15T12:47:35Z","published":"2024-05-11T16:30:39Z","title":"Solving Energy-Independent Density for CT Metal Artifact Reduction via\n Neural Representation","summary":" X-ray CT often suffers from shadowing and streaking artifacts in the presence\nof metallic materials, which severely degrade imaging quality. Physically, the\nlinear attenuation coefficients (LACs) of metals vary significantly with X-ray\nenergy, causing a nonlinear beam hardening effect (BHE) in CT measurements.\nReconstructing CT images from metal-corrupted measurements consequently becomes\na challenging nonlinear inverse problem. Existing state-of-the-art (SOTA) metal\nartifact reduction (MAR) algorithms rely on supervised learning with numerous\npaired CT samples. While promising, these supervised methods often assume that\nthe unknown LACs are energy-independent, ignoring the energy-induced BHE, which\nresults in limited generalization. Moreover, the requirement for large datasets\nalso limits their applications in real-world scenarios. In this work, we\npropose Density neural representation (Diner), a novel unsupervised MAR method.\nOur key innovation lies in formulating MAR as an energy-independent density\nreconstruction problem that strictly adheres to the photon-tissue absorption\nphysical model. This model is inherently nonlinear and complex, making it a\nrarely considered approach in inverse imaging problems. By introducing the\nwater-equivalent tissues approximation and a new polychromatic model to\ncharacterize the nonlinear CT acquisition process, we directly learn the neural\nrepresentation of the density map from raw measurements without using external\ntraining data. This energy-independent density reconstruction framework\nfundamentally resolves the nonlinear BHE, enabling superior MAR performance\nacross a wide range of scanning scenarios. Extensive experiments on both\nsimulated and real-world datasets demonstrate the superiority of our\nunsupervised Diner over popular supervised methods in terms of MAR performance\nand robustness.\n","authors":["Qing Wu","Xu Guo","Lixuan Chen","Yanyan Liu","Dongming He","Xudong Wang","Xueli Chen","Yifeng Zhang","S. 
Kevin Zhou","Jingyi Yu","Yuyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.07047v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2312.17345v2","updated":"2025-01-15T12:46:07Z","published":"2023-12-28T20:26:03Z","title":"3VL: Using Trees to Improve Vision-Language Models' Interpretability","summary":" Vision-Language models (VLMs) have proven to be effective at aligning image\nand text representations, producing superior zero-shot results when transferred\nto many downstream tasks. However, these representations suffer from some key\nshortcomings in understanding Compositional Language Concepts (CLC), such as\nrecognizing objects' attributes, states, and relations between different\nobjects. Moreover, VLMs typically have poor interpretability, making it\nchallenging to debug and mitigate compositional-understanding failures. In this\nwork, we introduce the architecture and training technique of Tree-augmented\nVision-Language (3VL) model accompanied by our proposed Anchor inference method\nand Differential Relevance (DiRe) interpretability tool. By expanding the text\nof an arbitrary image-text pair into a hierarchical tree structure using\nlanguage analysis tools, 3VL allows the induction of this structure into the\nvisual representation learned by the model, enhancing its interpretability and\ncompositional reasoning. Additionally, we show how Anchor, a simple technique\nfor text unification, can be used to filter nuisance factors while increasing\nCLC understanding performance, e.g., on the fundamental VL-Checklist benchmark.\nWe also show how DiRe, which performs a differential comparison between VLM\nrelevancy maps, enables us to generate compelling visualizations of the reasons\nfor a model's success or failure. Our code is available at:\nhttps://github.com/niryellinek/3VL.\n","authors":["Nir Yellinek","Leonid Karlinsky","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2312.17345v2.pdf","comment":"accepted to IEEE TIP"},{"id":"http://arxiv.org/abs/2501.08771v1","updated":"2025-01-15T12:44:52Z","published":"2025-01-15T12:44:52Z","title":"Admitting Ignorance Helps the Video Question Answering Models to Answer","summary":" Significant progress has been made in the field of video question answering\n(VideoQA) thanks to deep learning and large-scale pretraining. Despite the\npresence of sophisticated model structures and powerful video-text foundation\nmodels, most existing methods focus solely on maximizing the correlation\nbetween answers and video-question pairs during training. We argue that these\nmodels often establish shortcuts, resulting in spurious correlations between\nquestions and answers, especially when the alignment between video and text\ndata is suboptimal. To address these spurious correlations, we propose a novel\ntraining framework in which the model is compelled to acknowledge its ignorance\nwhen presented with an intervened question, rather than making guesses solely\nbased on superficial question-answer correlations. We introduce methodologies\nfor intervening in questions, utilizing techniques such as displacement and\nperturbation, and design frameworks for the model to admit its lack of\nknowledge in both multi-choice VideoQA and open-ended settings. In practice, we\nintegrate a state-of-the-art model into our framework to validate its\neffectiveness. 
The results clearly demonstrate that our framework can\nsignificantly enhance the performance of VideoQA models with minimal structural\nmodifications.\n","authors":["Haopeng Li","Tom Drummond","Mingming Gong","Mohammed Bennamoun","Qiuhong Ke"],"pdf_url":"https://arxiv.org/pdf/2501.08771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06406v2","updated":"2025-01-15T12:36:24Z","published":"2024-03-11T03:35:41Z","title":"When No-Reference Image Quality Models Meet MAP Estimation in Diffusion\n Latents","summary":" Contemporary no-reference image quality assessment (NR-IQA) models can\neffectively quantify perceived image quality, often achieving strong\ncorrelations with human perceptual scores on standard IQA benchmarks. Yet,\nlimited efforts have been devoted to treating NR-IQA models as natural image\npriors for real-world image enhancement, and consequently comparing them from a\nperceptual optimization standpoint. In this work, we show -- for the first time\n-- that NR-IQA models can be plugged into the maximum a posteriori (MAP)\nestimation framework for image enhancement. This is achieved by performing\ngradient ascent in the diffusion latent space rather than in the raw pixel\ndomain, leveraging a pretrained differentiable and bijective diffusion process.\nLikely, different NR-IQA models lead to different enhanced outputs, which in\nturn provides a new computational means of comparing them. Unlike conventional\ncorrelation-based measures, our comparison method offers complementary insights\ninto the respective strengths and weaknesses of the competing NR-IQA models in\nperceptual optimization scenarios. Additionally, we aim to improve the\nbest-performing NR-IQA model in diffusion latent MAP estimation by\nincorporating the advantages of other top-performing methods. The resulting\nmodel delivers noticeably better results in enhancing real-world images\nafflicted by unknown and complex distortions, all preserving a high degree of\nimage fidelity.\n","authors":["Weixia Zhang","Dingquan Li","Guangtao Zhai","Xiaokang Yang","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2403.06406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08763v1","updated":"2025-01-15T12:33:11Z","published":"2025-01-15T12:33:11Z","title":"Few-Shot Learner Generalizes Across AI-Generated Image Detection","summary":" Current fake image detectors trained on large synthetic image datasets\nperform satisfactorily on limited studied generative models. However, they\nsuffer a notable performance decline over unseen models. Besides, collecting\nadequate training data from online generative models is often expensive or\ninfeasible. To overcome these issues, we propose Few-Shot Detector (FSD), a\nnovel AI-generated image detector which learns a specialized metric space to\neffectively distinguish unseen fake images by utilizing very few samples.\nExperiments show FSD achieves state-of-the-art performance by $+7.4\\%$ average\nACC on GenImage dataset. 
More importantly, our method is better capable of\ncapturing the intra-category common features in unseen images without further\ntraining.\n","authors":["Shiyu Wu","Jing Liu","Jing Li","Yequan Wang"],"pdf_url":"https://arxiv.org/pdf/2501.08763v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.01505v4","updated":"2025-01-15T12:31:57Z","published":"2024-01-03T02:22:34Z","title":"Sports-QA: A Large-Scale Video Question Answering Benchmark for Complex\n and Professional Sports","summary":" Reasoning over sports videos for question answering is an important task with\nnumerous applications, such as player training and information retrieval.\nHowever, this task has not been explored due to the lack of relevant datasets\nand the challenging nature it presents. Most datasets for video question\nanswering (VideoQA) focus mainly on general and coarse-grained understanding of\ndaily-life videos, which is not applicable to sports scenarios requiring\nprofessional action understanding and fine-grained motion analysis. In this\npaper, we introduce the first dataset, named Sports-QA, specifically designed\nfor the sports VideoQA task. The Sports-QA dataset includes various types of\nquestions, such as descriptions, chronologies, causalities, and counterfactual\nconditions, covering multiple sports. Furthermore, to address the\ncharacteristics of the sports VideoQA task, we propose a new Auto-Focus\nTransformer (AFT) capable of automatically focusing on particular scales of\ntemporal information for question answering. We conduct extensive experiments\non Sports-QA, including baseline studies and the evaluation of different\nmethods. The results demonstrate that our AFT achieves state-of-the-art\nperformance.\n","authors":["Haopeng Li","Andong Deng","Jun Liu","Hossein Rahmani","Yulan Guo","Bernt Schiele","Mohammed Bennamoun","Qiuhong Ke"],"pdf_url":"https://arxiv.org/pdf/2401.01505v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08002v2","updated":"2025-01-15T11:52:29Z","published":"2025-01-14T10:46:41Z","title":"Maximizing Uncertainty for Federated learning via Bayesian\n Optimisation-based Model Poisoning","summary":" As we transition from Narrow Artificial Intelligence towards Artificial Super\nIntelligence, users are increasingly concerned about their privacy and the\ntrustworthiness of machine learning (ML) technology. A common denominator for\nthe metrics of trustworthiness is the quantification of uncertainty inherent in\nDL algorithms, and specifically in the model parameters, input data, and model\npredictions. One of the common approaches to address privacy-related issues in\nDL is to adopt distributed learning such as federated learning (FL), where\nprivate raw data is not shared among users. Despite the privacy-preserving\nmechanisms in FL, it still faces challenges in trustworthiness. Specifically,\nthe malicious users, during training, can systematically create malicious model\nparameters to compromise the models predictive and generative capabilities,\nresulting in high uncertainty about their reliability. To demonstrate malicious\nbehaviour, we propose a novel model poisoning attack method named Delphi which\naims to maximise the uncertainty of the global model output. We achieve this by\ntaking advantage of the relationship between the uncertainty and the model\nparameters of the first hidden layer of the local model. 
Delphi employs two\ntypes of optimisation, Bayesian Optimisation and Least Squares Trust Region,\nto search for the optimal poisoned model parameters, named as Delphi-BO and\nDelphi-LSTR. We quantify the uncertainty using the KL Divergence to minimise\nthe distance of the predictive probability distribution towards an uncertain\ndistribution of model output. Furthermore, we establish a mathematical proof\nfor the attack effectiveness demonstrated in FL. Numerical results demonstrate\nthat Delphi-BO induces a higher amount of uncertainty than Delphi-LSTR,\nhighlighting the vulnerability of FL systems to model poisoning attacks.\n","authors":["Marios Aristodemou","Xiaolan Liu","Yuan Wang","Konstantinos G. Kyriakopoulos","Sangarapillai Lambotharan","Qingsong Wei"],"pdf_url":"https://arxiv.org/pdf/2501.08002v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2402.12238v2","updated":"2025-01-15T11:52:13Z","published":"2024-02-19T15:48:55Z","title":"MGF: Mixed Gaussian Flow for Diverse Trajectory Prediction","summary":" To predict future trajectories, the normalizing flow with a standard Gaussian\nprior suffers from weak diversity. The ineffectiveness comes from the conflict\nbetween the fact of asymmetric and multi-modal distribution of likely outcomes\nand symmetric and single-modal original distribution and supervision losses.\nInstead, we propose constructing a mixed Gaussian prior for a normalizing flow\nmodel for trajectory prediction. The prior is constructed by analyzing the\ntrajectory patterns in the training samples without requiring extra annotations\nwhile showing better expressiveness and being multi-modal and asymmetric.\nBesides diversity, it also provides better controllability for probabilistic\ntrajectory generation. We name our method Mixed Gaussian Flow (MGF). It\nachieves state-of-the-art performance in the evaluation of both trajectory\nalignment and diversity on the popular UCY/ETH and SDD datasets. Code is\navailable at https://github.com/mulplue/MGF.\n","authors":["Jiahe Chen","Jinkun Cao","Dahua Lin","Kris Kitani","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2402.12238v2.pdf","comment":"Accepted by Neurips 2024. Code: https://github.com/mulplue/MGF"},{"id":"http://arxiv.org/abs/2407.11664v3","updated":"2025-01-15T11:51:19Z","published":"2024-07-16T12:36:26Z","title":"Mask-guided cross-image attention for zero-shot in-silico\n histopathologic image generation with a diffusion model","summary":" Creating in-silico data with generative AI promises a cost-effective\nalternative to staining, imaging, and annotating whole slide images in\ncomputational pathology. Diffusion models are the state-of-the-art solution for\ngenerating in-silico images, offering unparalleled fidelity and realism. Using\nappearance transfer diffusion models allows for zero-shot image generation,\nfacilitating fast application and making model training unnecessary. However,\ncurrent appearance transfer diffusion models are designed for natural images,\nwhere the main task is to transfer the foreground object from an origin to a\ntarget domain, while the background is of insignificant importance. In\ncomputational pathology, specifically in oncology, it is however not\nstraightforward to define which objects in an image should be classified as\nforeground and background, as all objects in an image may be of critical\nimportance for the detailed understanding of the tumor micro-environment. 
We\ncontribute to the applicability of appearance transfer diffusion models to\nimmunohistochemistry-stained images by modifying the appearance transfer\nguidance to alternate between class-specific AdaIN feature statistics matchings\nusing existing segmentation masks. The performance of the proposed method is\ndemonstrated on the downstream task of supervised epithelium segmentation,\nshowing that the number of manual annotations required for model training can\nbe reduced by 75%, outperforming the baseline approach. Additionally, we\nconsulted with a certified pathologist to investigate future improvements. We\nanticipate this work to inspire the application of zero-shot diffusion models\nin computational pathology, providing an efficient method to generate in-silico\nimages with unmatched fidelity and realism, which prove meaningful for\ndownstream tasks, such as training existing deep learning models or finetuning\nfoundation models.\n","authors":["Dominik Winter","Nicolas Triltsch","Marco Rosati","Anatoliy Shumilov","Ziya Kokaragac","Yuri Popov","Thomas Padel","Laura Sebastian Monasor","Ross Hill","Markus Schick","Nicolas Brieu"],"pdf_url":"https://arxiv.org/pdf/2407.11664v3.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2501.08717v1","updated":"2025-01-15T10:58:32Z","published":"2025-01-15T10:58:32Z","title":"$\\texttt{InfoHier}$: Hierarchical Information Extraction via Encoding\n and Embedding","summary":" Analyzing large-scale datasets, especially involving complex and\nhigh-dimensional data like images, is particularly challenging. While\nself-supervised learning (SSL) has proven effective for learning\nrepresentations from unlabelled data, it typically focuses on flat,\nnon-hierarchical structures, missing the multi-level relationships present in\nmany real-world datasets. Hierarchical clustering (HC) can uncover these\nrelationships by organizing data into a tree-like structure, but it often\nrelies on rigid similarity metrics that struggle to capture the complexity of\ndiverse data types. To address these we envision $\\texttt{InfoHier}$, a\nframework that combines SSL with HC to jointly learn robust latent\nrepresentations and hierarchical structures. This approach leverages SSL to\nprovide adaptive representations, enhancing HC's ability to capture complex\npatterns. Simultaneously, it integrates HC loss to refine SSL training,\nresulting in representations that are more attuned to the underlying\ninformation hierarchy. $\\texttt{InfoHier}$ has the potential to improve the\nexpressiveness and performance of both clustering and representation learning,\noffering significant benefits for data analysis, management, and information\nretrieval.\n","authors":["Tianru Zhang","Li Ju","Prashant Singh","Salman Toor"],"pdf_url":"https://arxiv.org/pdf/2501.08717v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2501.08712v1","updated":"2025-01-15T10:54:21Z","published":"2025-01-15T10:54:21Z","title":"Self-supervised Transformation Learning for Equivariant Representations","summary":" Unsupervised representation learning has significantly advanced various\nmachine learning tasks. In the computer vision domain, state-of-the-art\napproaches utilize transformations like random crop and color jitter to achieve\ninvariant representations, embedding semantically the same inputs despite\ntransformations. However, this can degrade performance in tasks requiring\nprecise features, such as localization or flower classification. 
To address\nthis, recent research incorporates equivariant representation learning, which\ncaptures transformation-sensitive information. However, current methods depend\non transformation labels and thus struggle with interdependency and complex\ntransformations. We propose Self-supervised Transformation Learning (STL),\nreplacing transformation labels with transformation representations derived\nfrom image pairs. The proposed method ensures transformation representation is\nimage-invariant and learns corresponding equivariant transformations, enhancing\nperformance without increased batch complexity. We demonstrate the approach's\neffectiveness across diverse classification and detection tasks, outperforming\nexisting methods in 7 out of 11 benchmarks and excelling in detection. By\nintegrating complex transformations like AugMix, unusable by prior equivariant\nmethods, this approach enhances performance across tasks, underscoring its\nadaptability and resilience. Additionally, its compatibility with various base\nmodels highlights its flexibility and broad applicability. The code is\navailable at https://github.com/jaemyung-u/stl.\n","authors":["Jaemyung Yu","Jaehyun Choi","Dong-Jae Lee","HyeongGwon Hong","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2501.08712v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2501.08115v2","updated":"2025-01-15T10:05:39Z","published":"2025-01-14T13:46:07Z","title":"RoHan: Robust Hand Detection in Operation Room","summary":" Hand-specific localization has garnered significant interest within the\ncomputer vision community. Although there are numerous datasets with hand\nannotations from various angles and settings, domain transfer techniques\nfrequently struggle in surgical environments. This is mainly due to the limited\navailability of gloved hand instances and the unique challenges of operating\nrooms (ORs). Thus, hand-detection models tailored to OR settings require\nextensive training and expensive annotation processes. To overcome these\nchallenges, we present \"RoHan\" - a novel approach for robust hand detection in\nthe OR, leveraging advanced semi-supervised domain adaptation techniques to\ntackle the challenges of varying recording conditions, diverse glove colors,\nand occlusions common in surgical settings. Our methodology encompasses two\nmain stages: (1) data augmentation strategy that utilizes \"Artificial Gloves,\"\na method for augmenting publicly available hand datasets with synthetic images\nof hands-wearing gloves; (2) semi-supervised domain adaptation pipeline that\nimproves detection performance in real-world OR settings through iterative\nprediction refinement and efficient frame filtering. We evaluate our method\nusing two datasets: simulated enterotomy repair and saphenous vein graft\nharvesting. 
\"RoHan\" substantially reduces the need for extensive labeling and\nmodel training, paving the way for the practical implementation of hand\ndetection technologies in medical settings.\n","authors":["Roi Papo","Sapir Gershov","Tom Friedman","Itay Or","Gil Bolotin","Shlomi Laufer"],"pdf_url":"https://arxiv.org/pdf/2501.08115v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2410.05301v2","updated":"2025-01-15T09:42:42Z","published":"2024-10-04T12:22:54Z","title":"Diffusion-based Unsupervised Audio-visual Speech Enhancement","summary":" This paper proposes a new unsupervised audio-visual speech enhancement (AVSE)\napproach that combines a diffusion-based audio-visual speech generative model\nwith a non-negative matrix factorization (NMF) noise model. First, the\ndiffusion model is pre-trained on clean speech conditioned on corresponding\nvideo data to simulate the speech generative distribution. This pre-trained\nmodel is then paired with the NMF-based noise model to estimate clean speech\niteratively. Specifically, a diffusion-based posterior sampling approach is\nimplemented within the reverse diffusion process, where after each iteration, a\nspeech estimate is obtained and used to update the noise parameters.\nExperimental results confirm that the proposed AVSE approach not only\noutperforms its audio-only counterpart but also generalizes better than a\nrecent supervised-generative AVSE method. Additionally, the new inference\nalgorithm offers a better balance between inference speed and performance\ncompared to the previous diffusion-based method. Code and demo available at:\nhttps://jeaneudesayilo.github.io/fast_UdiffSE\n","authors":["Jean-Eudes Ayilo","Mostafa Sadeghi","Romain Serizel","Xavier Alameda-Pineda"],"pdf_url":"https://arxiv.org/pdf/2410.05301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.06787v2","updated":"2025-01-15T09:39:03Z","published":"2025-01-12T11:54:46Z","title":"Improving Pain Classification using Spatio-Temporal Deep Learning\n Approaches with Facial Expressions","summary":" Pain management and severity detection are crucial for effective treatment,\nyet traditional self-reporting methods are subjective and may be unsuitable for\nnon-verbal individuals (people with limited speaking skills). To address this\nlimitation, we explore automated pain detection using facial expressions. Our\nstudy leverages deep learning techniques to improve pain assessment by\nanalyzing facial images from the Pain Emotion Faces Database (PEMF). We propose\ntwo novel approaches1: (1) a hybrid ConvNeXt model combined with Long\nShort-Term Memory (LSTM) blocks to analyze video frames and predict pain\npresence, and (2) a Spatio-Temporal Graph Convolution Network (STGCN)\nintegrated with LSTM to process landmarks from facial images for pain\ndetection. Our work represents the first use of the PEMF dataset for binary\npain classification and demonstrates the effectiveness of these models through\nextensive experimentation. The results highlight the potential of combining\nspatial and temporal features for enhanced pain detection, offering a promising\nadvancement in objective pain assessment methodologies.\n","authors":["Aafaf Ridouan","Amine Bohi","Youssef Mourchid"],"pdf_url":"https://arxiv.org/pdf/2501.06787v2.pdf","comment":"8 pages, 3 figures, 3 tables. 
Accepted and presented at the 18th\n International Conference on Machine Vision (ICMV 2024), Edinburgh, UK"},{"id":"http://arxiv.org/abs/2501.08682v1","updated":"2025-01-15T09:22:38Z","published":"2025-01-15T09:22:38Z","title":"RealVVT: Towards Photorealistic Video Virtual Try-on via Spatio-Temporal\n Consistency","summary":" Virtual try-on has emerged as a pivotal task at the intersection of computer\nvision and fashion, aimed at digitally simulating how clothing items fit on the\nhuman body. Despite notable progress in single-image virtual try-on (VTO),\ncurrent methodologies often struggle to preserve a consistent and authentic\nappearance of clothing across extended video sequences. This challenge arises\nfrom the complexities of capturing dynamic human pose and maintaining target\nclothing characteristics. We leverage pre-existing video foundation models to\nintroduce RealVVT, a photoRealistic Video Virtual Try-on framework tailored to\nbolster stability and realism within dynamic video contexts. Our methodology\nencompasses a Clothing & Temporal Consistency strategy, an Agnostic-guided\nAttention Focus Loss mechanism to ensure spatial consistency, and a Pose-guided\nLong Video VTO technique adept at handling extended video sequences. Extensive\nexperiments across various datasets confirm that our approach outperforms\nexisting state-of-the-art models in both single-image and video VTO tasks,\noffering a viable solution for practical applications within the realms of\nfashion e-commerce and virtual fitting environments.\n","authors":["Siqi Li","Zhengkai Jiang","Jiawei Zhou","Zhihong Liu","Xiaowei Chi","Haoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2501.08682v1.pdf","comment":"10 pages (8 pages main text, 2 pages references), 5 figures in the\n main text, and 4 pages supplementary materials with 3 additional figures"},{"id":"http://arxiv.org/abs/2501.08676v1","updated":"2025-01-15T09:07:12Z","published":"2025-01-15T09:07:12Z","title":"FlexiClip: Locality-Preserving Free-Form Character Animation","summary":" Animating clipart images with seamless motion while maintaining visual\nfidelity and temporal coherence presents significant challenges. Existing\nmethods, such as AniClipart, effectively model spatial deformations but often\nfail to ensure smooth temporal transitions, resulting in artifacts like abrupt\nmotions and geometric distortions. Similarly, text-to-video (T2V) and\nimage-to-video (I2V) models struggle to handle clipart due to the mismatch in\nstatistical properties between natural video and clipart styles. This paper\nintroduces FlexiClip, a novel approach designed to overcome these limitations\nby addressing the intertwined challenges of temporal consistency and geometric\nintegrity. FlexiClip extends traditional B\\'ezier curve-based trajectory\nmodeling with key innovations: temporal Jacobians to correct motion dynamics\nincrementally, continuous-time modeling via probability flow ODEs (pfODEs) to\nmitigate temporal noise, and a flow matching loss inspired by GFlowNet\nprinciples to optimize smooth motion transitions. These enhancements ensure\ncoherent animations across complex scenarios involving rapid movements and\nnon-rigid deformations. Extensive experiments validate the effectiveness of\nFlexiClip in generating animations that are not only smooth and natural but\nalso structurally consistent across diverse clipart types, including humans and\nanimals. 
By integrating spatial and temporal modeling with pre-trained video\ndiffusion models, FlexiClip sets a new standard for high-quality clipart\nanimation, offering robust performance across a wide range of visual content.\nProject Page: https://creative-gen.github.io/flexiclip.github.io/\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2501.08676v1.pdf","comment":"13 pages, 4 figures, 7 tables"},{"id":"http://arxiv.org/abs/2501.08672v1","updated":"2025-01-15T09:04:56Z","published":"2025-01-15T09:04:56Z","title":"GS-LIVO: Real-Time LiDAR, Inertial, and Visual Multi-sensor Fused\n Odometry with Gaussian Mapping","summary":" In recent years, 3D Gaussian splatting (3D-GS) has emerged as a novel scene\nrepresentation approach. However, existing vision-only 3D-GS methods often rely\non hand-crafted heuristics for point-cloud densification and face challenges in\nhandling occlusions and high GPU memory and computation consumption.\nLiDAR-Inertial-Visual (LIV) sensor configuration has demonstrated superior\nperformance in localization and dense mapping by leveraging complementary\nsensing characteristics: rich texture information from cameras, precise\ngeometric measurements from LiDAR, and high-frequency motion data from IMU.\nInspired by this, we propose a novel real-time Gaussian-based simultaneous\nlocalization and mapping (SLAM) system. Our map system comprises a global\nGaussian map and a sliding window of Gaussians, along with an IESKF-based\nodometry. The global Gaussian map consists of hash-indexed voxels organized in\na recursive octree, effectively covering sparse spatial volumes while adapting\nto different levels of detail and scales. The Gaussian map is initialized\nthrough multi-sensor fusion and optimized with photometric gradients. Our\nsystem incrementally maintains a sliding window of Gaussians, significantly\nreducing GPU computation and memory consumption by only optimizing the map\nwithin the sliding window. Moreover, we implement a tightly coupled\nmulti-sensor fusion odometry with an iterative error state Kalman filter\n(IESKF), leveraging real-time updating and rendering of the Gaussian map. Our\nsystem represents the first real-time Gaussian-based SLAM framework deployable\non resource-constrained embedded systems, demonstrated on the NVIDIA Jetson\nOrin NX platform. The framework achieves real-time performance while\nmaintaining robust multi-sensor fusion capabilities. All implementation\nalgorithms, hardware designs, and CAD models will be publicly available.\n","authors":["Sheng Hong","Chunran Zheng","Yishu Shen","Changze Li","Fu Zhang","Tong Qin","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2501.08672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08667v1","updated":"2025-01-15T09:02:04Z","published":"2025-01-15T09:02:04Z","title":"TimeFlow: Longitudinal Brain Image Registration and Aging Progression\n Analysis","summary":" Predicting future brain states is crucial for understanding healthy aging and\nneurodegenerative diseases. Longitudinal brain MRI registration, a cornerstone\nfor such analyses, has long been limited by its inability to forecast future\ndevelopments, reliance on extensive, dense longitudinal data, and the need to\nbalance registration accuracy with temporal smoothness. In this work, we\npresent \\emph{TimeFlow}, a novel framework for longitudinal brain MRI\nregistration that overcomes all these challenges. 
Leveraging a U-Net\narchitecture with temporal conditioning inspired by diffusion models, TimeFlow\nenables accurate longitudinal registration and facilitates prospective analyses\nthrough future image prediction. Unlike traditional methods that depend on\nexplicit smoothness regularizers and dense sequential data, TimeFlow achieves\ntemporal consistency and continuity without these constraints. Experimental\nresults highlight its superior performance in both future timepoint prediction\nand registration accuracy compared to state-of-the-art methods. Additionally,\nTimeFlow supports novel biological brain aging analyses, effectively\ndifferentiating neurodegenerative conditions from healthy aging. It eliminates\nthe need for segmentation, thereby avoiding the challenges of non-trivial\nannotation and inconsistent segmentation errors. TimeFlow paves the way for\naccurate, data-efficient, and annotation-free prospective analyses of brain\naging and chronic diseases.\n","authors":["Bailiang Jian","Jiazhen Pan","Yitong Li","Fabian Bongratz","Ruochen Li","Daniel Rueckert","Benedikt Wiestler","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2501.08667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08665v1","updated":"2025-01-15T09:00:32Z","published":"2025-01-15T09:00:32Z","title":"A Survey on Facial Image Privacy Preservation in Cloud-Based Services","summary":" Facial recognition models are increasingly employed by commercial\nenterprises, government agencies, and cloud service providers for identity\nverification, consumer services, and surveillance. These models are often\ntrained using vast amounts of facial data processed and stored in cloud-based\nplatforms, raising significant privacy concerns. Users' facial images may be\nexploited without their consent, leading to potential data breaches and misuse.\nThis survey presents a comprehensive review of current methods aimed at\npreserving facial image privacy in cloud-based services. We categorize these\nmethods into two primary approaches: image obfuscation-based protection and\nadversarial perturbation-based protection. We provide an in-depth analysis of\nboth categories, offering qualitative and quantitative comparisons of their\neffectiveness. Additionally, we highlight unresolved challenges and propose\nfuture research directions to improve privacy preservation in cloud computing\nenvironments.\n","authors":["Chen Chen","Mengyuan Sun","Xueluan Gong","Yanjiao Chen","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2501.08665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08662v1","updated":"2025-01-15T08:57:41Z","published":"2025-01-15T08:57:41Z","title":"Product of Gaussian Mixture Diffusion Model for non-linear MRI Inversion","summary":" Diffusion models have recently shown remarkable results in magnetic resonance\nimaging reconstruction. However, the employed networks typically are black-box\nestimators of the (smoothed) prior score with tens of millions of parameters,\nrestricting interpretability and increasing reconstruction time. Furthermore,\nparallel imaging reconstruction algorithms either rely on off-line coil\nsensitivity estimation, which is prone to misalignment and restricting sampling\ntrajectories, or perform per-coil reconstruction, making the computational cost\nproportional to the number of coils. 
To overcome this, we jointly reconstruct\nthe image and the coil sensitivities using the lightweight,\nparameter-efficient, and interpretable product of Gaussian mixture diffusion\nmodel as an image prior and classical smoothness priors on the coil\nsensitivities. The proposed method delivers promising results while allowing\nfor fast inference and demonstrating robustness to contrast out-of-distribution\ndata and sampling trajectories, comparable to classical variational penalties\nsuch as total variation. Finally, the probabilistic formulation allows the\ncalculation of the posterior expectation and pixel-wise variance.\n","authors":["Laurenz Nagler","Martin Zach","Thomas Pock"],"pdf_url":"https://arxiv.org/pdf/2501.08662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08659v1","updated":"2025-01-15T08:50:52Z","published":"2025-01-15T08:50:52Z","title":"BRIGHT-VO: Brightness-Guided Hybrid Transformer for Visual Odometry with\n Multi-modality Refinement Module","summary":" Visual odometry (VO) plays a crucial role in autonomous driving, robotic\nnavigation, and other related tasks by estimating the position and orientation\nof a camera based on visual input. Significant progress has been made in\ndata-driven VO methods, particularly those leveraging deep learning techniques\nto extract image features and estimate camera poses. However, these methods\noften struggle in low-light conditions because of the reduced visibility of\nfeatures and the increased difficulty of matching keypoints. To address this\nlimitation, we introduce BrightVO, a novel VO model based on Transformer\narchitecture, which not only performs front-end visual feature extraction, but\nalso incorporates a multi-modality refinement module in the back-end that\nintegrates Inertial Measurement Unit (IMU) data. Using pose graph optimization,\nthis module iteratively refines pose estimates to reduce errors and improve\nboth accuracy and robustness. Furthermore, we create a synthetic low-light\ndataset, KiC4R, which includes a variety of lighting conditions to facilitate\nthe training and evaluation of VO frameworks in challenging environments.\nExperimental results demonstrate that BrightVO achieves state-of-the-art\nperformance on both the KiC4R dataset and the KITTI benchmarks. Specifically,\nit provides an average improvement of 20% in pose estimation accuracy in normal\noutdoor environments and 259% in low-light conditions, outperforming existing\nmethods. For widespread use and further development, the research work is fully\nopen-source at https://github.com/Anastasiawd/BrightVO.\n","authors":["Dongzhihan Wang","Yang Yang","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2501.08659v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2501.08654v1","updated":"2025-01-15T08:43:48Z","published":"2025-01-15T08:43:48Z","title":"StereoGen: High-quality Stereo Image Generation from a Single Image","summary":" State-of-the-art supervised stereo matching methods have achieved amazing\nresults on various benchmarks. However, these data-driven methods suffer from\ngeneralization to real-world scenarios due to the lack of real-world annotated\ndata. In this paper, we propose StereoGen, a novel pipeline for high-quality\nstereo image generation. This pipeline utilizes arbitrary single images as left\nimages and pseudo disparities generated by a monocular depth estimation model\nto synthesize high-quality corresponding right images. 
Unlike previous methods\nthat fill the occluded area in warped right images using random backgrounds or\nusing convolutions to take nearby pixels selectively, we fine-tune a diffusion\ninpainting model to recover the background. Images generated by our model\npossess better details and undamaged semantic structures. Besides, we propose\nTraining-free Confidence Generation and Adaptive Disparity Selection. The\nformer suppresses the negative effect of harmful pseudo ground truth during\nstereo training, while the latter helps generate a wider disparity distribution\nand better synthetic images. Experiments show that models trained under our\npipeline achieve state-of-the-art zero-shot generalization results among all\npublished methods. The code will be available upon publication of the paper.\n","authors":["Xianqi Wang","Hao Yang","Gangwei Xu","Junda Cheng","Min Lin","Yong Deng","Jinliang Zang","Yurui Chen","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2501.08654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.02640v3","updated":"2025-01-15T08:41:38Z","published":"2025-01-05T20:05:10Z","title":"Multispectral Pedestrian Detection with Sparsely Annotated Label","summary":" Although existing Sparsely Annotated Object Detection (SAOD) approaches have\nmade progress in handling sparsely annotated environments in the multispectral\ndomain, where only some pedestrians are annotated, they still have the\nfollowing limitations: (i) they lack considerations for improving the quality\nof pseudo-labels for missing annotations, and (ii) they rely on fixed ground\ntruth annotations, which leads to learning only a limited range of pedestrian\nvisual appearances in the multispectral domain. To address these issues, we\npropose a novel framework called Sparsely Annotated Multispectral Pedestrian\nDetection (SAMPD). For limitation (i), we introduce Multispectral\nPedestrian-aware Adaptive Weight (MPAW) and Positive Pseudo-label Enhancement\n(PPE) module. Utilizing multispectral knowledge, these modules ensure the\ngeneration of high-quality pseudo-labels and enable effective learning by\nincreasing weights for high-quality pseudo-labels based on modality\ncharacteristics. To address limitation (ii), we propose an Adaptive Pedestrian\nRetrieval Augmentation (APRA) module, which adaptively incorporates pedestrian\npatches from ground-truth and dynamically integrates high-quality pseudo-labels\nwith the ground-truth, facilitating a more diverse learning pool of\npedestrians. Extensive experimental results demonstrate that our SAMPD\nsignificantly enhances performance in sparsely annotated environments within\nthe multispectral domain.\n","authors":["Chan Lee","Seungho Shin","Gyeong-Moon Park","Jung Uk Kim"],"pdf_url":"https://arxiv.org/pdf/2501.02640v3.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2501.08649v1","updated":"2025-01-15T08:24:35Z","published":"2025-01-15T08:24:35Z","title":"Joint Learning of Depth and Appearance for Portrait Image Animation","summary":" 2D portrait animation has experienced significant advancements in recent\nyears. Much research has utilized the prior knowledge embedded in large\ngenerative diffusion models to enhance high-quality image manipulation.\nHowever, most methods only focus on generating RGB images as output, and the\nco-generation of consistent visual plus 3D output remains largely\nunder-explored. In our work, we propose to jointly learn the visual appearance\nand depth simultaneously in a diffusion-based portrait image generator. 
Our\nmethod embraces the end-to-end diffusion paradigm and introduces a new\narchitecture suitable for learning this conditional joint distribution,\nconsisting of a reference network and a channel-expanded diffusion backbone.\nOnce trained, our framework can be efficiently adapted to various downstream\napplications, such as facial depth-to-image and image-to-depth generation,\nportrait relighting, and audio-driven talking head animation with consistent 3D\noutput.\n","authors":["Xinya Ji","Gaspard Zoss","Prashanth Chandran","Lingchen Yang","Xun Cao","Barbara Solenthaler","Derek Bradley"],"pdf_url":"https://arxiv.org/pdf/2501.08649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08643v1","updated":"2025-01-15T08:11:24Z","published":"2025-01-15T08:11:24Z","title":"MonSter: Marry Monodepth to Stereo Unleashes Power","summary":" Stereo matching recovers depth from image correspondences. Existing methods\nstruggle to handle ill-posed regions with limited matching cues, such as\nocclusions and textureless areas. To address this, we propose MonSter, a novel\nmethod that leverages the complementary strengths of monocular depth estimation\nand stereo matching. MonSter integrates monocular depth and stereo matching\ninto a dual-branch architecture to iteratively improve each other.\nConfidence-based guidance adaptively selects reliable stereo cues for monodepth\nscale-shift recovery. The refined monodepth in turn guides stereo\neffectively at ill-posed regions. Such iterative mutual enhancement enables\nMonSter to evolve monodepth priors from coarse object-level structures to\npixel-level geometry, fully unlocking the potential of stereo matching. As\nshown in Fig.1, MonSter ranks 1st across the five most commonly used leaderboards\n-- SceneFlow, KITTI 2012, KITTI 2015, Middlebury, and ETH3D, achieving up to\n49.5% improvements (Bad 1.0 on ETH3D) over the previous best method.\nComprehensive analysis verifies the effectiveness of MonSter in ill-posed\nregions. In terms of zero-shot generalization, MonSter significantly and\nconsistently outperforms state-of-the-art across the board. The code is\npublicly available at: https://github.com/Junda24/MonSter.\n","authors":["Junda Cheng","Longliang Liu","Gangwei Xu","Xianqi Wang","Zhaoxing Zhang","Yong Deng","Jinliang Zang","Yurui Chen","Zhipeng Cai","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2501.08643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08639v1","updated":"2025-01-15T08:04:44Z","published":"2025-01-15T08:04:44Z","title":"Detecting Wildfire Flame and Smoke through Edge Computing using Transfer\n Learning Enhanced Deep Learning Models","summary":" Autonomous unmanned aerial vehicles (UAVs) integrated with edge computing\ncapabilities empower real-time data processing directly on the device,\ndramatically reducing latency in critical scenarios such as wildfire detection.\nThis study underscores Transfer Learning's (TL) significance in boosting the\nperformance of object detectors for identifying wildfire smoke and flames,\nespecially when trained on limited datasets, and investigates the impact TL has\non edge computing metrics, with the latter focusing on how TL-enhanced You Only\nLook Once (YOLO) models perform in terms of inference time, power usage, and\nenergy consumption when using edge computing devices. 
This study utilizes the\nAerial Fire and Smoke Essential (AFSE) dataset as the target, with the Flame\nand Smoke Detection Dataset (FASDD) and the Microsoft Common Objects in Context\n(COCO) dataset serving as source datasets. We explore a two-stage cascaded TL\nmethod, utilizing D-Fire or FASDD as initial stage target datasets and AFSE as\nthe subsequent stage. Through fine-tuning, TL significantly enhances detection\nprecision, achieving up to 79.2% mean Average Precision (mAP@0.5), reduces\ntraining time, and increases model generalizability across the AFSE dataset.\nHowever, cascaded TL yielded no notable improvements and TL alone did not\nbenefit the edge computing metrics evaluated. Lastly, this work found that\nYOLOv5n remains a powerful model when lacking hardware acceleration, finding\nthat YOLOv5n can process images nearly twice as fast as its newer counterpart,\nYOLO11n. Overall, the results affirm TL's role in augmenting the accuracy of\nobject detectors while also illustrating that additional enhancements are\nneeded to improve edge computing performance.\n","authors":["Giovanny Vazquez","Shengjie Zhai","Mei Yang"],"pdf_url":"https://arxiv.org/pdf/2501.08639v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.05095v5","updated":"2025-01-15T07:37:21Z","published":"2024-05-08T14:44:34Z","title":"Approximation properties relative to continuous scale space for hybrid\n discretizations of Gaussian derivative operators","summary":" This paper presents an analysis of properties of two hybrid discretization\nmethods for Gaussian derivatives, based on convolutions with either the\nnormalized sampled Gaussian kernel or the integrated Gaussian kernel followed\nby central differences. The motivation for studying these discretization\nmethods is that in situations when multiple spatial derivatives of different\norder are needed at the same scale level, they can be computed significantly\nmore efficiently compared to more direct derivative approximations based on\nexplicit convolutions with either sampled Gaussian kernels or integrated\nGaussian kernels.\n While these computational benefits do also hold for the genuinely discrete\napproach for computing discrete analogues of Gaussian derivatives, based on\nconvolution with the discrete analogue of the Gaussian kernel followed by\ncentral differences, the underlying mathematical primitives for the discrete\nanalogue of the Gaussian kernel, in terms of modified Bessel functions of\ninteger order, may not be available in certain frameworks for image processing,\nsuch as when performing deep learning based on scale-parameterized filters in\nterms of Gaussian derivatives, with learning of the scale levels.\n In this paper, we present a characterization of the properties of these\nhybrid discretization methods, in terms of quantitative performance measures\nconcerning the amount of spatial smoothing that they imply, as well as the\nrelative consistency of scale estimates obtained from scale-invariant feature\ndetectors with automatic scale selection, with an emphasis on the behaviour for\nvery small values of the scale parameter, which may differ significantly from\ncorresponding results obtained from the fully continuous scale-space theory, as\nwell as between different types of discretization methods.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2405.05095v5.pdf","comment":"23 pages, 9 figures. 
arXiv admin note: text overlap with\n arXiv:2311.11317"},{"id":"http://arxiv.org/abs/2411.15098v4","updated":"2025-01-15T07:30:29Z","published":"2024-11-22T17:55:15Z","title":"OminiControl: Minimal and Universal Control for Diffusion Transformer","summary":" In this paper, we introduce OminiControl, a highly versatile and\nparameter-efficient framework that integrates image conditions into pre-trained\nDiffusion Transformer (DiT) models. At its core, OminiControl leverages a\nparameter reuse mechanism, enabling the DiT to encode image conditions using\nitself as a powerful backbone and process them with its flexible multi-modal\nattention processors. Unlike existing methods, which rely heavily on additional\nencoder modules with complex architectures, OminiControl (1) effectively and\nefficiently incorporates injected image conditions with only ~0.1% additional\nparameters, and (2) addresses a wide range of image conditioning tasks in a\nunified manner, including subject-driven generation and spatially-aligned\nconditions such as edges, depth, and more. Remarkably, these capabilities are\nachieved by training on images generated by the DiT itself, which is\nparticularly beneficial for subject-driven generation. Extensive evaluations\ndemonstrate that OminiControl outperforms existing UNet-based and DiT-adapted\nmodels in both subject-driven and spatially-aligned conditional generation.\nAdditionally, we release our training dataset, Subjects200K, a diverse\ncollection of over 200,000 identity-consistent images, along with an efficient\ndata synthesis pipeline to advance research in subject-consistent generation.\n","authors":["Zhenxiong Tan","Songhua Liu","Xingyi Yang","Qiaochu Xue","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.15098v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08629v1","updated":"2025-01-15T07:24:15Z","published":"2025-01-15T07:24:15Z","title":"Self-Organizing Edge Computing Distribution Framework for Visual SLAM","summary":" Localization within a known environment is a crucial capability for mobile\nrobots. Simultaneous Localization and Mapping (SLAM) is a prominent solution to\nthis problem. SLAM is a framework that consists of a diverse set of\ncomputational tasks ranging from real-time tracking to computation-intensive\nmap optimization. This combination can present a challenge for resource-limited\nmobile robots. Previously, edge-assisted SLAM methods have demonstrated\npromising real-time execution capabilities by offloading heavy computations\nwhile performing real-time tracking onboard. However, the common approach of\nutilizing a client-server architecture for offloading is sensitive to server\nand network failures. In this article, we propose a novel edge-assisted SLAM\nframework capable of self-organizing fully distributed SLAM execution across a\nnetwork of devices or functioning on a single device without connectivity. The\narchitecture consists of three layers and is designed to be device-agnostic,\nresilient to network failures, and minimally invasive to the core SLAM system.\nWe have implemented and demonstrated the framework for monocular ORB SLAM3 and\nevaluated it in both fully distributed and standalone SLAM configurations\nagainst the ORB SLAM3. 
The experiment results demonstrate that the proposed\ndesign matches the accuracy and resource utilization of the monolithic approach\nwhile enabling collaborative execution.\n","authors":["Jussi Kalliola","Lauri Suomela","Sergio Moreschini","David Hästbacka"],"pdf_url":"https://arxiv.org/pdf/2501.08629v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.10919v3","updated":"2025-01-15T07:17:58Z","published":"2024-08-20T15:04:14Z","title":"CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network","summary":" In recent years, Wi-Fi sensing has garnered significant attention due to its\nnumerous benefits, such as privacy protection, low cost, and penetration\nability. Extensive research has been conducted in this field, focusing on areas\nsuch as gesture recognition, people identification, and fall detection.\nHowever, many data-driven methods encounter challenges related to domain shift,\nwhere the model fails to perform well in environments different from the\ntraining data. One major factor contributing to this issue is the limited\navailability of Wi-Fi sensing datasets, which makes models learn excessive\nirrelevant information and over-fit to the training set. Unfortunately,\ncollecting large-scale Wi-Fi sensing datasets across diverse scenarios is a\nchallenging task. To address this problem, we propose CrossFi, a siamese\nnetwork-based approach that excels in both in-domain scenario and cross-domain\nscenario, including few-shot, zero-shot scenarios, and even works in few-shot\nnew-class scenario where testing set contains new categories. The core\ncomponent of CrossFi is a sample-similarity calculation network called CSi-Net,\nwhich improves the structure of the siamese network by using an attention\nmechanism to capture similarity information, instead of simply calculating the\ndistance or cosine similarity. Based on it, we develop an extra Weight-Net that\ncan generate a template for each class, so that our CrossFi can work in\ndifferent scenarios. Experimental results demonstrate that our CrossFi achieves\nstate-of-the-art performance across various scenarios. In gesture recognition\ntask, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72%\nin one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario,\nand 84.75% in one-shot new-class scenario. The code for our model is publicly\navailable at https://github.com/RS2002/CrossFi.\n","authors":["Zijian Zhao","Tingwei Chen","Zhijie Cai","Xiaoyang Li","Hang Li","Qimei Chen","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.10919v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00330v2","updated":"2025-01-15T06:57:25Z","published":"2024-11-01T03:08:10Z","title":"Multiple Information Prompt Learning for Cloth-Changing Person\n Re-Identification","summary":" Cloth-changing person re-identification is a subject closer to the real\nworld, which focuses on solving the problem of person re-identification after\npedestrians change clothes. The primary challenge in this field is to overcome\nthe complex interplay between intra-class and inter-class variations and to\nidentify features that remain unaffected by changes in appearance. Sufficient\ndata collection for model training would significantly aid in addressing this\nproblem. 
However, it is challenging to gather diverse datasets in practice.\nCurrent methods focus on implicitly learning identity information from the\noriginal image or introducing additional auxiliary models, which are largely\nlimited by the quality of the image and the performance of the additional\nmodel. To address these issues, inspired by prompt learning, we propose a novel\nmultiple information prompt learning (MIPL) scheme for cloth-changing person\nReID, which learns identity robust features through the common prompt guidance\nof multiple messages. Specifically, the clothing information stripping (CIS)\nmodule is designed to decouple the clothing information from the original RGB\nimage features to counteract the influence of clothing appearance. The\nBio-guided attention (BGA) module is proposed to increase the learning\nintensity of the model for key information. A dual-length hybrid patch (DHP)\nmodule is employed to make the features have diverse coverage to minimize the\nimpact of feature bias. Extensive experiments demonstrate that the proposed\nmethod outperforms all state-of-the-art methods on the LTCC, Celeb-reID,\nCeleb-reID-light, and CSCC datasets, achieving rank-1 scores of 74.8%, 73.3%,\n66.0%, and 88.1%, respectively. When compared to AIM (CVPR23), ACID (TIP23),\nand SCNet (MM23), MIPL achieves rank-1 improvements of 11.3%, 13.8%, and 7.9%,\nrespectively, on the PRCC dataset.\n","authors":["Shengxun Wei","Zan Gao","Chunjie Ma","Yibo Zhao","Weili Guan","Shengyong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.00330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.00961v2","updated":"2025-01-15T06:46:51Z","published":"2025-01-01T21:45:00Z","title":"The Silent Majority: Demystifying Memorization Effect in the Presence of\n Spurious Correlations","summary":" Machine learning models often rely on simple spurious features -- patterns in\ntraining data that correlate with targets but are not causally related to them,\nlike image backgrounds in foreground classification. This reliance typically\nleads to imbalanced test performance across minority and majority groups. In\nthis work, we take a closer look at the fundamental cause of such imbalanced\nperformance through the lens of memorization, which refers to the ability to\npredict accurately on \\textit{atypical} examples (minority groups) in the\ntraining set but failing in achieving the same accuracy in the testing set.\nThis paper systematically shows the ubiquitous existence of spurious features\nin a small set of neurons within the network, providing the first-ever evidence\nthat memorization may contribute to imbalanced group performance. Through three\nexperimental sources of converging empirical evidence, we find the property of\na small subset of neurons or channels in memorizing minority group information.\nInspired by these findings, we articulate the hypothesis: the imbalanced group\nperformance is a byproduct of ``noisy'' spurious memorization confined to a\nsmall set of neurons. To further substantiate this hypothesis, we show that\neliminating these unnecessary spurious memorization patterns via a novel\nframework during training can significantly affect the model performance on\nminority groups. Our experimental results across various architectures and\nbenchmarks offer new insights on how neural networks encode core and spurious\nknowledge, laying the groundwork for future research in demystifying robustness\nto spurious correlation.\n","authors":["Chenyu You","Haocheng Dai","Yifei Min","Jasjeet S. 
Sekhon","Sarang Joshi","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2501.00961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19599v3","updated":"2025-01-15T06:40:31Z","published":"2024-09-29T07:32:14Z","title":"DATransNet: Dynamic Attention Transformer Network for Infrared Small\n Target Detection","summary":" Infrared small target detection (ISTD) is widely used in civilian and\nmilitary applications. However, ISTD encounters several challenges, including\nthe tendency for small and dim targets to be obscured by complex backgrounds. To\naddress this issue, we propose the Dynamic Attention Transformer Network\n(DATransNet), which aims to extract and preserve edge information of small\ntargets. DATransNet employs the Dynamic Attention Transformer (DATrans),\nsimulating central difference convolutions (CDC) to extract and integrate\ngradient features with deeper features. Furthermore, we propose a global feature\nextraction module (GFEM) that offers a comprehensive perspective to prevent the\nnetwork from focusing solely on details while neglecting the background\ninformation. We compare the network with state-of-the-art (SOTA) approaches,\nand the results demonstrate that our method performs effectively. Our source\ncode is available at https://github.com/greekinRoma/DATransNet.\n","authors":["Chen Hu","Yian Huang","Kexuan Li","Luping Zhang","Chang Long","Yiming Zhu","Tian Pu","Zhenming Peng"],"pdf_url":"https://arxiv.org/pdf/2409.19599v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03678v2","updated":"2025-01-15T06:32:05Z","published":"2022-06-08T05:04:43Z","title":"Ultra-High-Definition Image Deblurring via Multi-scale Cubic-Mixer","summary":" Currently, transformer-based algorithms are making a splash in the domain of\nimage deblurring. Their achievement depends on the self-attention mechanism\nwith CNN stem to model long range dependencies between tokens. Unfortunately,\nthis ear-pleasing pipeline introduces high computational complexity and makes\nit difficult to run an ultra-high-definition image on a single GPU in real\ntime. To trade-off accuracy and efficiency, the input degraded image is\ncomputed cyclically over three dimensional ($C$, $W$, and $H$) signals without\na self-attention mechanism. We term this deep network as Multi-scale\nCubic-Mixer, which is acted on both the real and imaginary components after\nfast Fourier transform to estimate the Fourier coefficients and thus obtain a\ndeblurred image. Furthermore, we combine the multi-scale cubic-mixer with a\nslicing strategy to generate high-quality results at a much lower computational\ncost. Experimental results demonstrate that the proposed algorithm performs\nfavorably against the state-of-the-art deblurring approaches on the several\nbenchmarks and a new ultra-high-definition dataset in terms of accuracy and\nspeed.\n","authors":["Xingchi Chen","Xiuyi Jia","Zhuoran Zheng"],"pdf_url":"https://arxiv.org/pdf/2206.03678v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2501.08609v1","updated":"2025-01-15T06:15:15Z","published":"2025-01-15T06:15:15Z","title":"Computerized Assessment of Motor Imitation for Distinguishing Autism in\n Video (CAMI-2DNet)","summary":" Motor imitation impairments are commonly reported in individuals with autism\nspectrum conditions (ASCs), suggesting that motor imitation could be used as a\nphenotype for addressing autism heterogeneity. Traditional methods for\nassessing motor imitation are subjective, labor-intensive, and require\nextensive human training. 
Modern Computerized Assessment of Motor Imitation\n(CAMI) methods, such as CAMI-3D for motion capture data and CAMI-2D for video\ndata, are less subjective. However, they rely on labor-intensive data\nnormalization and cleaning techniques, and human annotations for algorithm\ntraining. To address these challenges, we propose CAMI-2DNet, a scalable and\ninterpretable deep learning-based approach to motor imitation assessment in\nvideo data, which eliminates the need for data normalization, cleaning and\nannotation. CAMI-2DNet uses an encoder-decoder architecture to map a video to a\nmotion encoding that is disentangled from nuisance factors such as body shape\nand camera views. To learn a disentangled representation, we employ synthetic\ndata generated by motion retargeting of virtual characters through the\nreshuffling of motion, body shape, and camera views, as well as real\nparticipant data. To automatically assess how well an individual imitates an\nactor, we compute a similarity score between their motion encodings, and use it\nto discriminate individuals with ASCs from neurotypical (NT) individuals. Our\ncomparative analysis demonstrates that CAMI-2DNet has a strong correlation with\nhuman scores while outperforming CAMI-2D in discriminating ASC vs NT children.\nMoreover, CAMI-2DNet performs comparably to CAMI-3D while offering greater\npracticality by operating directly on video data and without the need for\nad-hoc data normalization and human annotations.\n","authors":["Kaleab A. Kinfu","Carolina Pacheco","Alice D. Sperry","Deana Crocetti","Bahar Tunçgenç","Stewart H. Mostofsky","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2501.08609v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2407.01960v2","updated":"2025-01-15T06:06:31Z","published":"2024-07-02T05:31:59Z","title":"Zero-shot Video Restoration and Enhancement Using Pre-Trained Image\n Diffusion Model","summary":" Diffusion-based zero-shot image restoration and enhancement models have\nachieved great success in various tasks of image restoration and enhancement.\nHowever, directly applying them to video restoration and enhancement results in\nsevere temporal flickering artifacts. In this paper, we propose the first\nframework for zero-shot video restoration and enhancement based on the\npre-trained image diffusion model. By replacing the spatial self-attention\nlayer with the proposed short-long-range (SLR) temporal attention layer, the\npre-trained image diffusion model can take advantage of the temporal\ncorrelation between frames. We further propose temporal consistency guidance,\nspatial-temporal noise sharing, and an early stopping sampling strategy to\nimprove temporally consistent sampling. Our method is a plug-and-play module\nthat can be inserted into any diffusion-based image restoration or enhancement\nmethods to further improve their performance. Experimental results demonstrate\nthe superiority of our proposed method. 
Our code is available at\nhttps://github.com/cao-cong/ZVRD.\n","authors":["Cong Cao","Huanjing Yue","Xin Liu","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01960v2.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2501.08605v1","updated":"2025-01-15T06:05:57Z","published":"2025-01-15T06:05:57Z","title":"PACF: Prototype Augmented Compact Features for Improving Domain Adaptive\n Object Detection","summary":" In recent years, there has been significant advancement in object detection.\nHowever, applying off-the-shelf detectors to a new domain leads to significant\nperformance drop, caused by the domain gap. These detectors exhibit\nhigher-variance class-conditional distributions in the target domain than those\nin the source domain, along with mean shift. To address this problem, we\npropose the Prototype Augmented Compact Features (PACF) framework to regularize\nthe distribution of intra-class features. Specifically, we provide an in-depth\ntheoretical analysis on the lower bound of the target features-related\nlikelihood and derive the prototype cross entropy loss to further calibrate the\ndistribution of target RoI features. Furthermore, a mutual regularization\nstrategy is designed to enable the linear and prototype-based classifiers to\nlearn from each other, promoting feature compactness while enhancing\ndiscriminability. Thanks to this PACF framework, we have obtained a more\ncompact cross-domain feature space, within which the variance of the target\nfeatures' class-conditional distributions has significantly decreased, and the\nclass-mean shift between the two domains has also been further reduced. The\nresults on different adaptation settings are state-of-the-art, which\ndemonstrates the broad applicability and effectiveness of the proposed approach.\n","authors":["Chenguang Liu","Yongchao Feng","Yanan Zhang","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2501.08605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08604v1","updated":"2025-01-15T06:04:18Z","published":"2025-01-15T06:04:18Z","title":"Watermarking in Diffusion Model: Gaussian Shading with Exact Diffusion\n Inversion via Coupled Transformations (EDICT)","summary":" This paper introduces a novel approach to enhance the performance of Gaussian\nShading, a prevalent watermarking technique, by integrating the Exact Diffusion\nInversion via Coupled Transformations (EDICT) framework. While Gaussian Shading\ntraditionally embeds watermarks in a noise latent space, followed by iterative\ndenoising for image generation and noise addition for watermark recovery, its\ninversion process is not exact, leading to potential watermark distortion. We\npropose to leverage EDICT's ability to derive exact inverse mappings to refine\nthis process. Our method involves duplicating the watermark-infused noisy\nlatent and employing a reciprocal, alternating denoising and noising scheme\nbetween the two latents, facilitated by EDICT. This allows for a more precise\nreconstruction of both the image and the embedded watermark. Empirical\nevaluation on standard datasets demonstrates that our integrated approach\nyields a slight, yet statistically significant improvement in watermark\nrecovery fidelity. These results highlight the potential of EDICT to enhance\nexisting diffusion-based watermarking techniques by providing a more accurate\nand robust inversion mechanism. 
To the best of our knowledge, this is the first\nwork to explore the synergy between EDICT and Gaussian Shading for digital\nwatermarking, opening new avenues for research in robust and high-fidelity\nwatermark embedding and extraction.\n","authors":["Krishna Panthi"],"pdf_url":"https://arxiv.org/pdf/2501.08604v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2501.08593v1","updated":"2025-01-15T05:36:41Z","published":"2025-01-15T05:36:41Z","title":"Image-to-Force Estimation for Soft Tissue Interaction in\n Robotic-Assisted Surgery Using Structured Light","summary":" For Minimally Invasive Surgical (MIS) robots, accurate haptic interaction\nforce feedback is essential for ensuring the safety of interacting with soft\ntissue. However, most existing MIS robotic systems cannot facilitate direct\nmeasurement of the interaction force with hardware sensors due to space\nlimitations. This letter introduces an effective vision-based scheme that\nutilizes a One-Shot structured light projection with a designed pattern on soft\ntissue coupled with haptic information processing through a trained\nimage-to-force neural network. The images captured from the endoscopic stereo\ncamera are analyzed to reconstruct high-resolution 3D point clouds for soft\ntissue deformation. Based on this, a modified PointNet-based force estimation\nmethod is proposed, which excels in representing the complex mechanical\nproperties of soft tissue. Numerical force interaction experiments are\nconducted on three silicon materials with different stiffness. The results\nvalidate the effectiveness of the proposed scheme.\n","authors":["Jiayin Wang","Mingfeng Yao","Yanran Wei","Xiaoyu Guo","Ayong Zheng","Weidong Zhao"],"pdf_url":"https://arxiv.org/pdf/2501.08593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00580v2","updated":"2025-01-15T05:30:24Z","published":"2024-11-30T20:40:10Z","title":"Continuous Concepts Removal in Text-to-image Diffusion Models","summary":" Text-to-image diffusion models have shown an impressive ability to generate\nhigh-quality images from input textual descriptions. However, concerns have\nbeen raised about the potential for these models to create content that\ninfringes on copyrights or depicts disturbing subject matter. Removing specific\nconcepts from these models is a promising potential solution to this problem.\nHowever, existing methods for concept removal do not work well in practical but\nchallenging scenarios where concepts need to be continuously removed.\nSpecifically, these methods lead to poor alignment between the text prompts and\nthe generated image after the continuous removal process. To address this\nissue, we propose a novel approach called CCRT that includes a designed\nknowledge distillation paradigm. It constrains the text-image alignment\nbehavior during the continuous concept removal process by using a set of text\nprompts generated through our genetic algorithm, which employs a designed\nfuzzing strategy. We conduct extensive experiments involving the removal of\nvarious concepts. 
The results evaluated through both algorithmic metrics and\nhuman studies demonstrate that our CCRT can effectively remove the targeted\nconcepts in a continuous manner while maintaining the high generation quality\n(e.g., text-image alignment) of the model.\n","authors":["Tingxu Han","Weisong Sun","Yanrong Hu","Chunrong Fang","Yonglong Zhang","Shiqing Ma","Tao Zheng","Zhenyu Chen","Zhenting Wang"],"pdf_url":"https://arxiv.org/pdf/2412.00580v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08580v1","updated":"2025-01-15T05:00:03Z","published":"2025-01-15T05:00:03Z","title":"Densely Connected Parameter-Efficient Tuning for Referring Image\n Segmentation","summary":" In the domain of computer vision, Parameter-Efficient Tuning (PET) is\nincreasingly replacing the traditional paradigm of pre-training followed by\nfull fine-tuning. PET is particularly favored for its effectiveness in large\nfoundation models, as it streamlines transfer learning costs and optimizes\nhardware utilization. However, the current PET methods are mainly designed for\nsingle-modal optimization. While some pioneering studies have undertaken\npreliminary explorations, they still remain at the level of aligned encoders\n(e.g., CLIP) and lack exploration of misaligned encoders. These methods show\nsub-optimal performance with misaligned encoders, as they fail to effectively\nalign the multimodal features during fine-tuning. In this paper, we introduce\nDETRIS, a parameter-efficient tuning framework designed to enhance low-rank\nvisual feature propagation by establishing dense interconnections between each\nlayer and all preceding layers, which enables effective cross-modal feature\ninteraction and adaptation to misaligned encoders. We also suggest using text\nadapters to improve textual features. Our simple yet efficient approach greatly\nsurpasses state-of-the-art methods with 0.9% to 1.8% backbone parameter\nupdates, evaluated on challenging benchmarks. Our project is available at\n\\url{https://github.com/jiaqihuang01/DETRIS}.\n","authors":["Jiaqi Huang","Zunnan Xu","Ting Liu","Yong Liu","Haonan Han","Kehong Yuan","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2501.08580v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2501.08577v1","updated":"2025-01-15T04:56:26Z","published":"2025-01-15T04:56:26Z","title":"Scalable and High-Quality Neural Implicit Representation for 3D\n Reconstruction","summary":" Various SDF-based neural implicit surface reconstruction methods have been\nproposed recently, and have demonstrated remarkable modeling capabilities.\nHowever, due to the global nature and limited representation ability of a\nsingle network, existing methods still suffer from many drawbacks, such as\nlimited accuracy and scale of the reconstruction. In this paper, we propose a\nversatile, scalable and high-quality neural implicit representation to address\nthese issues. We integrate a divide-and-conquer approach into the neural\nSDF-based reconstruction. Specifically, we model the object or scene as a\nfusion of multiple independent local neural SDFs with overlapping regions. The\nconstruction of our representation involves three key steps: (1) constructing\nthe distribution and overlap relationship of the local radiance fields based on\nobject structure or data distribution, (2) relative pose registration for\nadjacent local SDFs, and (3) SDF blending. 
Thanks to the independent\nrepresentation of each local region, our approach can not only achieve\nhigh-fidelity surface reconstruction, but also enable scalable scene\nreconstruction. Extensive experimental results demonstrate the effectiveness\nand practicality of our proposed method.\n","authors":["Leyuan Yang","Bailin Deng","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.08577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02281v2","updated":"2025-01-15T04:51:48Z","published":"2024-11-04T17:09:58Z","title":"Conformal-in-the-Loop for Learning with Imbalanced Noisy Data","summary":" Class imbalance and label noise are pervasive in large-scale datasets, yet\nmuch of machine learning research assumes well-labeled, balanced data, which\nrarely reflects real world conditions. Existing approaches typically address\neither label noise or class imbalance in isolation, leading to suboptimal\nresults when both issues coexist. In this work, we propose\nConformal-in-the-Loop (CitL), a novel training framework that addresses both\nchallenges with a conformal prediction-based approach. CitL evaluates sample\nuncertainty to adjust weights and prune unreliable examples, enhancing model\nresilience and accuracy with minimal computational cost. Our extensive\nexperiments include a detailed analysis showing how CitL effectively emphasizes\nimpactful data in noisy, imbalanced datasets. Our results show that CitL\nconsistently boosts model performance, achieving up to a 6.1% increase in\nclassification accuracy and a 5.0 mIoU improvement in segmentation. Our code is\npublicly available: CitL.\n","authors":["John Brandon Graham-Knight","Jamil Fayyad","Nourhan Bayasi","Patricia Lasserre","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2411.02281v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2501.08575v1","updated":"2025-01-15T04:51:10Z","published":"2025-01-15T04:51:10Z","title":"GOTLoc: General Outdoor Text-based Localization Using Scene Graph\n Retrieval with OpenStreetMap","summary":" We propose GOTLoc, a robust localization method capable of operating even in\noutdoor environments where GPS signals are unavailable. The method achieves\nthis robust localization by leveraging comparisons between scene graphs\ngenerated from text descriptions and maps. Existing text-based localization\nstudies typically represent maps as point clouds and identify the most similar\nscenes by comparing embeddings of text and point cloud data. However, point\ncloud maps have limited scalability as it is impractical to pre-generate maps\nfor all outdoor spaces. Furthermore, their large data size makes it challenging\nto store and utilize them directly on actual robots. To address these issues,\nGOTLoc leverages compact data structures, such as scene graphs, to store\nspatial information, enabling individual robots to carry and utilize large\namounts of map data. Additionally, by utilizing publicly available map data,\nsuch as OpenStreetMap, which provides global information on outdoor spaces, we\neliminate the need for additional effort to create custom map data. For\nperformance evaluation, we utilized the KITTI360Pose dataset in conjunction\nwith corresponding OpenStreetMap data to compare the proposed method with\nexisting approaches. Our results demonstrate that the proposed method achieves\naccuracy comparable to algorithms relying on point cloud maps. 
Moreover, in\ncity-scale tests, GOTLoc required significantly less storage compared to point\ncloud-based methods and completed overall processing within a few seconds,\nvalidating its applicability to real-world robotics. Our code is available at\nhttps://github.com/donghwijung/GOTLoc.\n","authors":["Donghwi Jung","Keonwoo Kim","Seong-Woo Kim"],"pdf_url":"https://arxiv.org/pdf/2501.08575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08562v1","updated":"2025-01-15T04:07:06Z","published":"2025-01-15T04:07:06Z","title":"MIAFEx: An Attention-based Feature Extraction Method for Medical Image\n Classification","summary":" Feature extraction techniques are crucial in medical image classification;\nhowever, classical feature extractors in addition to traditional machine\nlearning classifiers often exhibit significant limitations in providing\nsufficient discriminative information for complex image sets. While\nConvolutional Neural Networks (CNNs) and Vision Transformer (ViT) have shown\npromise in feature extraction, they are prone to overfitting due to the\ninherent characteristics of medical imaging data, including small sample sizes\nor high intra-class variance. In this work, the Medical Image Attention-based\nFeature Extractor (MIAFEx) is proposed, a novel method that employs a learnable\nrefinement mechanism to enhance the classification token within the Transformer\nencoder architecture. This mechanism adjusts the token based on learned\nweights, improving the extraction of salient features and enhancing the model's\nadaptability to the challenges presented by medical imaging data. The MIAFEx\noutput features quality is compared against classical feature extractors using\ntraditional and hybrid classifiers. Also, the performance of these features is\ncompared against modern CNN and ViT models in classification tasks,\ndemonstrating its superiority in accuracy and robustness across multiple\ncomplex classification medical imaging datasets. This advantage is particularly\npronounced in scenarios with limited training data, where traditional and\nmodern models often struggle to generalize effectively. The source code of this\nproposal can be found at\nhttps://github.com/Oscar-RamosS/Medical-Image-Attention-based-Feature-Extractor-MIAFEx\n","authors":["Oscar Ramos-Soto","Jorge Ramos-Frutos","Ezequiel Perez-Zarate","Diego Oliva","Sandra E. Balderas-Mata"],"pdf_url":"https://arxiv.org/pdf/2501.08562v1.pdf","comment":"In preparation for Journal Submission"},{"id":"http://arxiv.org/abs/2501.08553v1","updated":"2025-01-15T03:28:14Z","published":"2025-01-15T03:28:14Z","title":"DynamicFace: High-Quality and Consistent Video Face Swapping using\n Composable 3D Facial Priors","summary":" Face swapping transfers the identity of a source face to a target face while\nretaining the attributes like expression, pose, hair, and background of the\ntarget face. Advanced face swapping methods have achieved attractive results.\nHowever, these methods often inadvertently transfer identity information from\nthe target face, compromising expression-related details and accurate identity.\nWe propose a novel method DynamicFace that leverages the power of diffusion\nmodel and plug-and-play temporal layers for video face swapping. First, we\nintroduce four fine-grained face conditions using 3D facial priors. All\nconditions are designed to be disentangled from each other for precise and\nunique control. Then, we adopt Face Former and ReferenceNet for high-level and\ndetailed identity injection. 
Through experiments on the FF++ dataset, we\ndemonstrate that our method achieves state-of-the-art results in face swapping,\nshowcasing superior image quality, identity preservation, and expression\naccuracy. Besides, our method could be easily transferred to video domain with\ntemporal attention layer. Our code and results will be available on the project\npage: https://dynamic-face.github.io/\n","authors":["Runqi Wang","Sijie Xu","Tianyao He","Yang Chen","Wei Zhu","Dejia Song","Nemo Chen","Xu Tang","Yao Hu"],"pdf_url":"https://arxiv.org/pdf/2501.08553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08549v1","updated":"2025-01-15T03:17:24Z","published":"2025-01-15T03:17:24Z","title":"The Devil is in Temporal Token: High Quality Video Reasoning\n Segmentation","summary":" Existing methods for Video Reasoning Segmentation rely heavily on a single\nspecial token to represent the object in the keyframe or the entire video,\ninadequately capturing spatial complexity and inter-frame motion. To overcome\nthese challenges, we propose VRS-HQ, an end-to-end video reasoning segmentation\napproach that leverages Multimodal Large Language Models (MLLMs) to inject rich\nspatiotemporal features into hierarchical tokens. Our key innovations include a\nTemporal Dynamic Aggregation (TDA) and a Token-driven Keyframe Selection (TKS).\nSpecifically, we design frame-level and temporal-level tokens that\nutilize MLLM's autoregressive learning to effectively capture both local and\nglobal information. Subsequently, we apply a similarity-based weighted fusion\nand frame selection strategy, then utilize SAM2 to perform keyframe\nsegmentation and propagation. To enhance keyframe localization accuracy, the\nTKS filters keyframes based on SAM2's occlusion scores during inference. VRS-HQ\nachieves state-of-the-art performance on ReVOS, surpassing VISA by\n5.9%/12.5%/9.1% in J&F scores across the three subsets. These results highlight\nthe strong temporal reasoning and segmentation capabilities of our method. Code\nand model weights will be released at VRS-HQ.\n","authors":["Sitong Gong","Yunzhi Zhuge","Lu Zhang","Zongxin Yang","Pingping Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2501.08549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08545v1","updated":"2025-01-15T03:11:33Z","published":"2025-01-15T03:11:33Z","title":"Comprehensive Subjective and Objective Evaluation Method for\n Text-generated Video","summary":" Recent text-to-video (T2V) technology advancements, as demonstrated by models\nsuch as Gen3, Pika, and Sora, have significantly broadened its applicability\nand popularity. This progress has created a growing demand for accurate quality\nassessment metrics to evaluate the perceptual quality of text-generated videos\nand optimize video generation models. However, assessing the quality of\ntext-generated videos remains challenging due to the presence of highly complex\ndistortions, such as unnatural actions and phenomena that defy human cognition.\nTo address these challenges, we constructed a large-scale benchmark dataset for\n\\textbf{T}ext-generated \\textbf{V}ideo \\textbf{eval}uation,\n\\textbf{T2VEval-Bench}, comprising 148 textual words and 1,783 videos generated\nby 12 models. During the subjective evaluation, we collected five key scores:\noverall impression, video quality, aesthetic quality, realness, and text-video\nconsistency. 
For objective evaluation, we developed the \textbf{T2VEval} model,\nwhich assesses videos across three branches: quality, authenticity, and\nconsistency. Using an attention-based fusion module, T2VEval effectively\nintegrates features from each branch and predicts scores with the aid of a\nlarge oracle model. Additionally, we implemented a progressive training\nstrategy, enabling each branch to learn targeted knowledge while maintaining\nsynergy with the others. Experimental results demonstrate that T2VEval achieves\nstate-of-the-art performance across multiple metrics. The dataset and code will\nbe open-sourced upon completion of the follow-up work.\n","authors":["Zelu Qi","Ping Shi","Shuqi Wang","Zhaoyang Zhang","Zefeng Ying","Da Pan"],"pdf_url":"https://arxiv.org/pdf/2501.08545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19727v2","updated":"2025-01-15T02:29:14Z","published":"2024-09-29T14:57:45Z","title":"Investigating the Effect of Network Pruning on Performance and\n Interpretability","summary":" Deep Neural Networks (DNNs) are often over-parameterized for their tasks and\ncan be compressed quite drastically by removing weights, a process called\npruning. We investigate the impact of different pruning techniques on the\nclassification performance and interpretability of GoogLeNet. We systematically\napply unstructured and structured pruning, as well as connection sparsity\n(pruning of input weights) methods to the network and analyze the outcomes\nregarding the network's performance on the validation set of ImageNet. We also\ncompare different retraining strategies, such as iterative pruning and one-shot\npruning. We find that with sufficient retraining epochs, the performance of the\nnetworks can approximate the performance of the default GoogLeNet - and even\nsurpass it in some cases. To assess interpretability, we employ the Mechanistic\nInterpretability Score (MIS) developed by Zimmermann et al. Our experiments\nreveal that there is no significant relationship between interpretability and\npruning rate when using MIS as a measure. Additionally, we observe that\nnetworks with extremely low accuracy can still achieve high MIS scores,\nsuggesting that the MIS may not always align with intuitive notions of\ninterpretability, such as understanding the basis of correct decisions.\n","authors":["Jonathan von Rad","Florian Seuffert"],"pdf_url":"https://arxiv.org/pdf/2409.19727v2.pdf","comment":"4 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.07870v2","updated":"2025-01-15T02:23:10Z","published":"2025-01-14T06:21:31Z","title":"Make-A-Character 2: Animatable 3D Character Generation From a Single\n Image","summary":" This report introduces Make-A-Character 2, an advanced system for generating\nhigh-quality 3D characters from single portrait photographs, ideal for game\ndevelopment and digital human applications. Make-A-Character 2 builds upon its\npredecessor by incorporating several significant improvements for image-based\nhead generation. We utilize the IC-Light method to correct non-ideal\nillumination in input photos and apply neural network-based color correction to\nharmonize skin tones between the photos and game engine renders. We also employ\nthe Hierarchical Representation Network to capture high-frequency facial\nstructures and conduct adaptive skeleton calibration for accurate and\nexpressive facial animations. The entire image-to-3D-character generation\nprocess takes less than 2 minutes. 
Furthermore, we leverage transformer\narchitecture to generate co-speech facial and gesture actions, enabling\nreal-time conversation with the generated character. These technologies have\nbeen integrated into our conversational AI avatar products.\n","authors":["Lin Liu","Yutong Wang","Jiahao Chen","Jianfang Li","Tangli Xue","Longlong Li","Jianqiang Ren","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2501.07870v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2412.11409v3","updated":"2025-01-15T01:59:02Z","published":"2024-12-16T03:25:23Z","title":"Multi-modal and Multi-scale Spatial Environment Understanding for\n Immersive Visual Text-to-Speech","summary":" Visual Text-to-Speech (VTTS) aims to take the environmental image as the\nprompt to synthesize the reverberant speech for the spoken content. The\nchallenge of this task lies in understanding the spatial environment from the\nimage. Many attempts have been made to extract global spatial visual\ninformation from the RGB space of an spatial image. However, local and depth\nimage information are crucial for understanding the spatial environment, which\nprevious works have ignored. To address the issues, we propose a novel\nmulti-modal and multi-scale spatial environment understanding scheme to achieve\nimmersive VTTS, termed M2SE-VTTS. The multi-modal aims to take both the RGB and\nDepth spaces of the spatial image to learn more comprehensive spatial\ninformation, and the multi-scale seeks to model the local and global spatial\nknowledge simultaneously. Specifically, we first split the RGB and Depth images\ninto patches and adopt the Gemini-generated environment captions to guide the\nlocal spatial understanding. After that, the multi-modal and multi-scale\nfeatures are integrated by the local-aware global spatial understanding. In\nthis way, M2SE-VTTS effectively models the interactions between local and\nglobal spatial contexts in the multi-modal spatial environment. Objective and\nsubjective evaluations suggest that our model outperforms the advanced\nbaselines in environmental speech generation. The code and audio samples are\navailable at: https://github.com/AI-S2-Lab/M2SE-VTTS.\n","authors":["Rui Liu","Shuwei He","Yifan Hu","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2412.11409v3.pdf","comment":"9 pages,2 figures, Accepted by AAAI'2025"},{"id":"http://arxiv.org/abs/2501.08514v1","updated":"2025-01-15T01:52:54Z","published":"2025-01-15T01:52:54Z","title":"Multimodal Fake News Video Explanation Generation","summary":" Multi-modal explanation involves the assessment of the veracity of a variety\nof different content, and relies on multiple information modalities to\ncomprehensively consider the relevance and consistency between modalities. Most\nexisting fake news video detection methods focus on improving accuracy while\nignoring the importance of providing explanations. In this paper, we propose a\nnovel problem - Fake News Video Explanation (FNVE) - Given a multimodal news\ncontaining both video and caption text, we aim to generate natural language\nexplanations to reveal the truth of predictions. To this end, we develop\nFakeNVE, a new dataset of explanations for truthfully multimodal posts, where\neach explanation is a natural language (English) sentence describing the\nattribution of a news thread. We benchmark FakeNVE by using a multimodal\ntransformer-based architecture. Subsequently, a BART-based autoregressive\ndecoder is used as the generator. 
Empirical results show compelling results for\nvarious baselines (applicable to FNVE) across multiple evaluation metrics. We\nalso perform human evaluation on explanation generation, achieving high scores\nfor both adequacy and fluency.\n","authors":["Lizhi Chen","Zhong Qian","Peifeng Li","Qiaoming Zhu"],"pdf_url":"https://arxiv.org/pdf/2501.08514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04939v2","updated":"2025-01-15T01:12:29Z","published":"2025-01-09T03:04:08Z","title":"Multi-Context Temporal Consistent Modeling for Referring Video Object\n Segmentation","summary":" Referring video object segmentation aims to segment objects within a video\ncorresponding to a given text description. Existing transformer-based temporal\nmodeling approaches face challenges related to query inconsistency and the\nlimited consideration of context. Query inconsistency produces unstable masks\nof different objects in the middle of the video. The limited consideration of\ncontext leads to the segmentation of incorrect objects by failing to adequately\naccount for the relationship between the given text and instances. To address\nthese issues, we propose the Multi-context Temporal Consistency Module (MTCM),\nwhich consists of an Aligner and a Multi-Context Enhancer (MCE). The Aligner\nremoves noise from queries and aligns them to achieve query consistency. The\nMCE predicts text-relevant queries by considering multi-context. We applied\nMTCM to four different models, increasing performance across all of them,\nparticularly achieving 47.6 J&F on the MeViS. Code is available at\nhttps://github.com/Choi58/MTCM.\n","authors":["Sun-Hyuk Choi","Hayoung Jo","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2501.04939v2.pdf","comment":"Comment: Accepted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.08506v1","updated":"2025-01-15T00:56:59Z","published":"2025-01-15T00:56:59Z","title":"Exploring the Efficacy of Meta-Learning: Unveiling Superior Data\n Diversity Utilization of MAML Over Pre-training","summary":" Currently, data and model size dominate the narrative in the training of\nsuper-large, powerful models. However, there has been a lack of exploration on\nthe effect of other attributes of the training dataset on model performance. We\nhypothesize that dataset diversity can impact the performance of vision models.\nOur study shows positive correlations between test set accuracy and data\ndiversity, providing an argument for furthering the research of dataset\nattributes beyond size. We analyzed pre-training and model-agnostic\nmeta-learning methods on twelve popular visual datasets (e.g., Omniglot,\nCIFAR-FS, Aircraft) and five model configurations, including MAML variants with\ndifferent numbers of inner gradient steps and supervised learning. We show\nmoderate to strong positive correlations (R-squared: 0.15-0.42) between\naccuracy and data diversity and weaker but significant correlations (R-squared:\n~0.2) between loss and diversity. These findings support our hypothesis and\ndemonstrate a promising way for a deeper exploration of how formal data\ndiversity influences model performance. 
This initial study highlights the\npotential of (Task2Vec) data diversity as a valuable measure in the rapidly\nevolving field of large-scale learning and emphasizes that understanding the\ndataset is key to building more powerful and generalizable models.\n","authors":["Kavita Selva","Satita Vittayaareekul","Brando Miranda"],"pdf_url":"https://arxiv.org/pdf/2501.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07163v2","updated":"2025-01-15T00:54:54Z","published":"2025-01-13T09:49:34Z","title":"Adaptive Noise-Tolerant Network for Image Segmentation","summary":" Unlike image classification and annotation, for which deep network models\nhave achieved dominating superior performances compared to traditional computer\nvision algorithms, deep learning for automatic image segmentation still faces\ncritical challenges. One of such hurdles is to obtain ground-truth\nsegmentations as the training labels for deep network training. Especially when\nwe study biomedical images, such as histopathological images (histo-images), it\nis unrealistic to ask for manual segmentation labels as the ground truth for\ntraining due to the fine image resolution as well as the large image size and\ncomplexity. In this paper, instead of relying on clean segmentation labels, we\nstudy whether and how integrating imperfect or noisy segmentation results from\noff-the-shelf segmentation algorithms may help achieve better segmentation\nresults through a new Adaptive Noise-Tolerant Network (ANTN) model. We extend\nthe noisy label deep learning to image segmentation with two novel aspects: (1)\nmultiple noisy labels can be integrated into one deep learning model; (2) noisy\nsegmentation modeling, including probabilistic parameters, is adaptive,\ndepending on the given testing image appearance. Implementation of the new ANTN\nmodel on both the synthetic data and real-world histo-images demonstrates its\neffectiveness and superiority over off-the-shelf and other existing\ndeep-learning-based image segmentation algorithms.\n","authors":["Weizhi Li"],"pdf_url":"https://arxiv.org/pdf/2501.07163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08505v1","updated":"2025-01-15T00:54:33Z","published":"2025-01-15T00:54:33Z","title":"Yuan: Yielding Unblemished Aesthetics Through A Unified Network for\n Visual Imperfections Removal in Generated Images","summary":" Generative AI presents transformative potential across various domains, from\ncreative arts to scientific visualization. However, the utility of AI-generated\nimagery is often compromised by visual flaws, including anatomical\ninaccuracies, improper object placements, and misplaced textual elements. These\nimperfections pose significant challenges for practical applications. To\novercome these limitations, we introduce \\textit{Yuan}, a novel framework that\nautonomously corrects visual imperfections in text-to-image synthesis.\n\\textit{Yuan} uniquely conditions on both the textual prompt and the segmented\nimage, generating precise masks that identify areas in need of refinement\nwithout requiring manual intervention -- a common constraint in previous\nmethodologies. Following the automated masking process, an advanced inpainting\nmodule seamlessly integrates contextually coherent content into the identified\nregions, preserving the integrity and fidelity of the original image and\nassociated text prompts. 
Through extensive experimentation on publicly\navailable datasets such as ImageNet100 and Stanford Dogs, along with a\ncustom-generated dataset, \\textit{Yuan} demonstrated superior performance in\neliminating visual imperfections. Our approach consistently achieved higher\nscores in quantitative metrics, including NIQE, BRISQUE, and PI, alongside\nfavorable qualitative evaluations. These results underscore \\textit{Yuan}'s\npotential to significantly enhance the quality and applicability of\nAI-generated images across diverse fields.\n","authors":["Zhenyu Yu","Chee Seng Chan"],"pdf_url":"https://arxiv.org/pdf/2501.08505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08504v1","updated":"2025-01-15T00:54:12Z","published":"2025-01-15T00:54:12Z","title":"SuperSAM: Crafting a SAM Supernetwork via Structured Pruning and\n Unstructured Parameter Prioritization","summary":" Neural Architecture Search (NAS) is a powerful approach of automating the\ndesign of efficient neural architectures. In contrast to traditional NAS\nmethods, recently proposed one-shot NAS methods prove to be more efficient in\nperforming NAS. One-shot NAS works by generating a singular weight-sharing\nsupernetwork that acts as a search space (container) of subnetworks. Despite\nits achievements, designing the one-shot search space remains a major\nchallenge. In this work we propose a search space design strategy for Vision\nTransformer (ViT)-based architectures. In particular, we convert the Segment\nAnything Model (SAM) into a weight-sharing supernetwork called SuperSAM. Our\napproach involves automating the search space design via layer-wise structured\npruning and parameter prioritization. While the structured pruning applies\nprobabilistic removal of certain transformer layers, parameter prioritization\nperforms weight reordering and slicing of MLP-blocks in the remaining layers.\nWe train supernetworks on several datasets using the sandwich rule. For\ndeployment, we enhance subnetwork discovery by utilizing a program autotuner to\nidentify efficient subnetworks within the search space. The resulting\nsubnetworks are 30-70% smaller in size compared to the original pre-trained SAM\nViT-B, yet outperform the pretrained model. Our work introduces a new and\neffective method for ViT NAS search-space design.\n","authors":["Waqwoya Abebe","Sadegh Jafari","Sixing Yu","Akash Dutta","Jan Strube","Nathan R. Tallent","Luanzheng Guo","Pablo Munoz","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2501.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14762v3","updated":"2025-01-15T00:53:38Z","published":"2024-11-22T06:50:44Z","title":"Efficient Long Video Tokenization via Coordinate-based Patch\n Reconstruction","summary":" Efficient tokenization of videos remains a challenge in training vision\nmodels that can process long videos. One promising direction is to develop a\ntokenizer that can encode long video clips, as it would enable the tokenizer to\nleverage the temporal coherence of videos better for tokenization. However,\ntraining existing tokenizers on long videos often incurs a huge training cost\nas they are trained to reconstruct all the frames at once. In this paper, we\nintroduce CoordTok, a video tokenizer that learns a mapping from\ncoordinate-based representations to the corresponding patches of input videos,\ninspired by recent advances in 3D generative models. 
In particular, CoordTok\nencodes a video into factorized triplane representations and reconstructs\npatches that correspond to randomly sampled $(x,y,t)$ coordinates. This allows\nfor training large tokenizer models directly on long videos without requiring\nexcessive training resources. Our experiments show that CoordTok can\ndrastically reduce the number of tokens for encoding long video clips. For\ninstance, CoordTok can encode a 128-frame video with 128$\\times$128 resolution\ninto 1280 tokens, while baselines need 6144 or 8192 tokens to achieve similar\nreconstruction quality. We further show that this efficient video tokenization\nenables memory-efficient training of a diffusion transformer that can generate\n128 frames at once.\n","authors":["Huiwon Jang","Sihyun Yu","Jinwoo Shin","Pieter Abbeel","Younggyo Seo"],"pdf_url":"https://arxiv.org/pdf/2411.14762v3.pdf","comment":"Code is available on the project webpage:\n https://huiwon-jang.github.io/coordtok/"},{"id":"http://arxiv.org/abs/2412.14340v2","updated":"2025-01-15T00:02:00Z","published":"2024-12-18T21:17:02Z","title":"A Unifying Information-theoretic Perspective on Evaluating Generative\n Models","summary":" Considering the difficulty of interpreting generative model output, there is\nsignificant current research focused on determining meaningful evaluation\nmetrics. Several recent approaches utilize \"precision\" and \"recall,\" borrowed\nfrom the classification domain, to individually quantify the output fidelity\n(realism) and output diversity (representation of the real data variation),\nrespectively. With the increase in metric proposals, there is a need for a\nunifying perspective, allowing for easier comparison and clearer explanation of\ntheir benefits and drawbacks. To this end, we unify a class of\nkth-nearest-neighbors (kNN)-based metrics under an information-theoretic lens\nusing approaches from kNN density estimation. Additionally, we propose a\ntri-dimensional metric composed of Precision Cross-Entropy (PCE), Recall\nCross-Entropy (RCE), and Recall Entropy (RE), which separately measure fidelity\nand two distinct aspects of diversity, inter- and intra-class. Our\ndomain-agnostic metric, derived from the information-theoretic concepts of\nentropy and cross-entropy, can be dissected for both sample- and mode-level\nanalysis. Our detailed experimental results demonstrate the sensitivity of our\nmetric components to their respective qualities and reveal undesirable\nbehaviors of other metrics.\n","authors":["Alexis Fox","Samarth Swarup","Abhijin Adiga"],"pdf_url":"https://arxiv.org/pdf/2412.14340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09203v1","updated":"2025-01-15T23:36:05Z","published":"2025-01-15T23:36:05Z","title":"Unified Few-shot Crack Segmentation and its Precise 3D Automatic\n Measurement in Concrete Structures","summary":" Visual-Spatial Systems has become increasingly essential in concrete crack\ninspection. However, existing methods often lacks adaptability to diverse\nscenarios, exhibits limited robustness in image-based approaches, and struggles\nwith curved or complex geometries. To address these limitations, an innovative\nframework for two-dimensional (2D) crack detection, three-dimensional (3D)\nreconstruction, and 3D automatic crack measurement was proposed by integrating\ncomputer vision technologies and multi-modal Simultaneous localization and\nmapping (SLAM) in this study. 
Firstly, building on a base DeepLabv3+\nsegmentation model, and incorporating specific refinements utilizing foundation\nmodel Segment Anything Model (SAM), we developed a crack segmentation method\nwith strong generalization across unfamiliar scenarios, enabling the generation\nof precise 2D crack masks. To enhance the accuracy and robustness of 3D\nreconstruction, Light Detection and Ranging (LiDAR) point clouds were utilized\ntogether with image data and segmentation masks. By leveraging both image- and\nLiDAR-SLAM, we developed a multi-frame and multi-modal fusion framework that\nproduces dense, colorized point clouds, effectively capturing crack semantics\nat a 3D real-world scale. Furthermore, the crack geometric attributions were\nmeasured automatically and directly within 3D dense point cloud space,\nsurpassing the limitations of conventional 2D image-based measurements. This\nadvancement makes the method suitable for structural components with curved and\ncomplex 3D geometries. Experimental results across various concrete structures\nhighlight the significant improvements and unique advantages of the proposed\nmethod, demonstrating its effectiveness, accuracy, and robustness in real-world\napplications.\n","authors":["Pengru Deng","Jiapeng Yao","Chun Li","Su Wang","Xinrun Li","Varun Ojha","Xuhui He","Takashi Matsumoto"],"pdf_url":"https://arxiv.org/pdf/2501.09203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09520v2","updated":"2025-01-15T23:21:06Z","published":"2024-09-14T20:11:25Z","title":"Enhancing Skin Disease Diagnosis: Interpretable Visual Concept Discovery\n with SAM","summary":" Current AI-assisted skin image diagnosis has achieved dermatologist-level\nperformance in classifying skin cancer, driven by rapid advancements in deep\nlearning architectures. However, unlike traditional vision tasks, skin images\nin general present unique challenges due to the limited availability of\nwell-annotated datasets, complex variations in conditions, and the necessity\nfor detailed interpretations to ensure patient safety. Previous segmentation\nmethods have sought to reduce image noise and enhance diagnostic performance,\nbut these techniques require fine-grained, pixel-level ground truth masks for\ntraining. In contrast, with the rise of foundation models, the Segment Anything\nModel (SAM) has been introduced to facilitate promptable segmentation, enabling\nthe automation of the segmentation process with simple yet effective prompts.\nEfforts applying SAM predominantly focus on dermatoscopy images, which present\nmore easily identifiable lesion boundaries than clinical photos taken with\nsmartphones. This limitation constrains the practicality of these approaches to\nreal-world applications. To overcome the challenges posed by noisy clinical\nphotos acquired via non-standardized protocols and to improve diagnostic\naccessibility, we propose a novel Cross-Attentive Fusion framework for\ninterpretable skin lesion diagnosis. Our method leverages SAM to generate\nvisual concepts for skin diseases using prompts, integrating local visual\nconcepts with global image features to enhance model performance. 
Extensive\nevaluation on two skin disease datasets demonstrates our proposed method's\neffectiveness on lesion diagnosis and interpretability.\n","authors":["Xin Hu","Janet Wang","Jihun Hamm","Rie R Yotsu","Zhengming Ding"],"pdf_url":"https://arxiv.org/pdf/2409.09520v2.pdf","comment":"This paper is accepted by WACV 2025"},{"id":"http://arxiv.org/abs/2501.09194v1","updated":"2025-01-15T22:55:26Z","published":"2025-01-15T22:55:26Z","title":"Grounding Text-To-Image Diffusion Models For Controlled High-Quality\n Image Generation","summary":" Large-scale text-to-image (T2I) diffusion models have demonstrated an\noutstanding performance in synthesizing diverse high-quality visuals from\nnatural language text captions. Multiple layout-to-image models have been\ndeveloped to control the generation process by utilizing a broad array of\nlayouts such as segmentation maps, edges, and human keypoints. In this work, we\npresent ObjectDiffusion, a model that takes inspirations from the top\ncutting-edge image generative frameworks to seamlessly condition T2I models\nwith new bounding boxes capabilities. Specifically, we make substantial\nmodifications to the network architecture introduced in ContorlNet to integrate\nit with the condition processing and injection techniques proposed in GLIGEN.\nObjectDiffusion is initialized with pretraining parameters to leverage the\ngeneration knowledge obtained from training on large-scale datasets. We\nfine-tune ObjectDiffusion on the COCO2017 training dataset and evaluate it on\nthe COCO2017 validation dataset. Our model achieves an AP$_{50}$ of 46.6, an AR\nof 44.5, and a FID of 19.8 outperforming the current SOTA model trained on\nopen-source datasets in all of the three metrics. ObjectDiffusion demonstrates\na distinctive capability in synthesizing diverse, high-quality, high-fidelity\nimages that seamlessly conform to the semantic and spatial control layout.\nEvaluated in qualitative and quantitative tests, ObjectDiffusion exhibits\nremarkable grounding abilities on closed-set and open-set settings across a\nwide variety of contexts. The qualitative assessment verifies the ability of\nObjectDiffusion to generate multiple objects of different sizes and locations.\n","authors":["Ahmad Süleyman","Göksel Biricik"],"pdf_url":"https://arxiv.org/pdf/2501.09194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09187v1","updated":"2025-01-15T22:26:26Z","published":"2025-01-15T22:26:26Z","title":"Patch-aware Vector Quantized Codebook Learning for Unsupervised Visual\n Defect Detection","summary":" Unsupervised visual defect detection is critical in industrial applications,\nrequiring a representation space that captures normal data features while\ndetecting deviations. Achieving a balance between expressiveness and\ncompactness is challenging; an overly expressive space risks inefficiency and\nmode collapse, impairing detection accuracy. We propose a novel approach using\nan enhanced VQ-VAE framework optimized for unsupervised defect detection. Our\nmodel introduces a patch-aware dynamic code assignment scheme, enabling\ncontext-sensitive code allocation to optimize spatial representation. This\nstrategy enhances normal-defect distinction and improves detection accuracy\nduring inference. 
Experiments on MVTecAD, BTAD, and MTSD datasets show our\nmethod achieves state-of-the-art performance.\n","authors":["Qisen Cheng","Shuhui Qu","Janghwan Lee"],"pdf_url":"https://arxiv.org/pdf/2501.09187v1.pdf","comment":"7 pages, Accepted to 36th IEEE ICTAI 2024"},{"id":"http://arxiv.org/abs/2501.09185v1","updated":"2025-01-15T22:23:41Z","published":"2025-01-15T22:23:41Z","title":"Cancer-Net PCa-Seg: Benchmarking Deep Learning Models for Prostate\n Cancer Segmentation Using Synthetic Correlated Diffusion Imaging","summary":" Prostate cancer (PCa) is the most prevalent cancer among men in the United\nStates, accounting for nearly 300,000 cases, 29% of all diagnoses and 35,000\ntotal deaths in 2024. Traditional screening methods such as prostate-specific\nantigen (PSA) testing and magnetic resonance imaging (MRI) have been pivotal in\ndiagnosis, but have faced limitations in specificity and generalizability. In\nthis paper, we explore the potential of enhancing PCa lesion segmentation using\na novel MRI modality called synthetic correlated diffusion imaging (CDI$^s$).\nWe employ several state-of-the-art deep learning models, including U-Net,\nSegResNet, Swin UNETR, Attention U-Net, and LightM-UNet, to segment PCa lesions\nfrom a 200 CDI$^s$ patient cohort. We find that SegResNet achieved superior\nsegmentation performance with a Dice-Sorensen coefficient (DSC) of $76.68 \\pm\n0.8$. Notably, the Attention U-Net, while slightly less accurate (DSC $74.82\n\\pm 2.0$), offered a favorable balance between accuracy and computational\nefficiency. Our findings demonstrate the potential of deep learning models in\nimproving PCa lesion segmentation using CDI$^s$ to enhance PCa management and\nclinical support.\n","authors":["Jarett Dewbury","Chi-en Amy Tai","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2501.09185v1.pdf","comment":"8 pages, 2 figures, to be published in Studies in Computational\n Intelligence. This paper introduces Cancer-Net PCa-Seg, a comprehensive\n evaluation of deep learning models for prostate cancer segmentation using\n synthetic correlated diffusion imaging (CDI$^s$). We benchmark five\n state-of-the-art architectures: U-Net, SegResNet, Swin UNETR, Attention\n U-Net, and LightM-UNet"},{"id":"http://arxiv.org/abs/2312.11458v3","updated":"2025-01-15T22:17:24Z","published":"2023-12-18T18:59:03Z","title":"GauFRe: Gaussian Deformation Fields for Real-time Dynamic Novel View\n Synthesis","summary":" We propose a method that achieves state-of-the-art rendering quality and\nefficiency on monocular dynamic scene reconstruction using deformable 3D\nGaussians. Implicit deformable representations commonly model motion with a\ncanonical space and time-dependent backward-warping deformation field. Our\nmethod, GauFRe, uses a forward-warping deformation to explicitly model\nnon-rigid transformations of scene geometry. Specifically, we propose a\ntemplate set of 3D Gaussians residing in a canonical space, and a\ntime-dependent forward-warping deformation field to model dynamic objects.\nAdditionally, we tailor a 3D Gaussian-specific static component supported by an\ninductive bias-aware initialization approach which allows the deformation field\nto focus on moving scene regions, improving the rendering of complex real-world\nmotion. The differentiable pipeline is optimized end-to-end with a\nself-supervised rendering loss. Experiments show our method achieves\ncompetitive results and higher efficiency than both previous state-of-the-art\nNeRF and Gaussian-based methods. 
For real-world scenes, GauFRe can train in ~20\nmins and offer 96 FPS real-time rendering on an RTX 3090 GPU. Project website:\nhttps://lynl7130.github.io/gaufre/index.html\n","authors":["Yiqing Liang","Numair Khan","Zhengqin Li","Thu Nguyen-Phuoc","Douglas Lanman","James Tompkin","Lei Xiao"],"pdf_url":"https://arxiv.org/pdf/2312.11458v3.pdf","comment":"WACV 2025. 11 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2501.09167v1","updated":"2025-01-15T21:36:19Z","published":"2025-01-15T21:36:19Z","title":"Embodied Scene Understanding for Vision Language Models via MetaVQA","summary":" Vision Language Models (VLMs) demonstrate significant potential as embodied\nAI agents for various mobility applications. However, a standardized,\nclosed-loop benchmark for evaluating their spatial reasoning and sequential\ndecision-making capabilities is lacking. To address this, we present MetaVQA: a\ncomprehensive benchmark designed to assess and enhance VLMs' understanding of\nspatial relationships and scene dynamics through Visual Question Answering\n(VQA) and closed-loop simulations. MetaVQA leverages Set-of-Mark prompting and\ntop-down view ground-truth annotations from nuScenes and Waymo datasets to\nautomatically generate extensive question-answer pairs based on diverse\nreal-world traffic scenarios, ensuring object-centric and context-rich\ninstructions. Our experiments show that fine-tuning VLMs with the MetaVQA\ndataset significantly improves their spatial reasoning and embodied scene\ncomprehension in safety-critical simulations, evident not only in improved VQA\naccuracies but also in emerging safety-aware driving maneuvers. In addition,\nthe learning demonstrates strong transferability from simulation to real-world\nobservation. Code and data will be publicly available at\nhttps://metadriverse.github.io/metavqa .\n","authors":["Weizhen Wang","Chenda Duan","Zhenghao Peng","Yuxin Liu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2501.09167v1.pdf","comment":"for the project webpage, see https://metadriverse.github.io/metavqa"},{"id":"http://arxiv.org/abs/2501.09162v1","updated":"2025-01-15T21:28:47Z","published":"2025-01-15T21:28:47Z","title":"A Vessel Bifurcation Landmark Pair Dataset for Abdominal CT Deformable\n Image Registration (DIR) Validation","summary":" Deformable image registration (DIR) is an enabling technology in many\ndiagnostic and therapeutic tasks. Despite this, DIR algorithms have limited\nclinical use, largely due to a lack of benchmark datasets for quality assurance\nduring development. To support future algorithm development, here we introduce\nour first-of-its-kind abdominal CT DIR benchmark dataset, comprising large\nnumbers of highly accurate landmark pairs on matching blood vessel\nbifurcations. Abdominal CT image pairs of 30 patients were acquired from\nseveral public repositories as well as the authors' institution with IRB\napproval. The two CTs of each pair were originally acquired for the same\npatient on different days. An image processing workflow was developed and\napplied to each image pair: 1) Abdominal organs were segmented with a deep\nlearning model, and image intensity within organ masks was overwritten. 2)\nMatching image patches were manually identified between two CTs of each image\npair 3) Vessel bifurcation landmarks were labeled on one image of each image\npatch pair. 4) Image patches were deformably registered, and landmarks were\nprojected onto the second image. 5) Landmark pair locations were refined\nmanually or with an automated process. 
This workflow resulted in 1895 total\nlandmark pairs, or 63 per case on average. Estimates of the landmark pair\naccuracy using digital phantoms were 0.7+/-1.2mm. The data is published in\nZenodo at https://doi.org/10.5281/zenodo.14362785. Instructions for use can be\nfound at https://github.com/deshanyang/Abdominal-DIR-QA. This dataset is a\nfirst-of-its-kind for abdominal DIR validation. The number, accuracy, and\ndistribution of landmark pairs will allow for robust validation of DIR\nalgorithms with precision beyond what is currently available.\n","authors":["Edward R Criscuolo","Yao Hao","Zhendong Zhang","Trevor McKeown","Deshan Yang"],"pdf_url":"https://arxiv.org/pdf/2501.09162v1.pdf","comment":"19 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.09155v1","updated":"2025-01-15T21:14:36Z","published":"2025-01-15T21:14:36Z","title":"VCRScore: Image captioning metric based on V\\&L Transformers, CLIP, and\n precision-recall","summary":" Image captioning has become an essential Vision & Language research task. It\nis about predicting the most accurate caption given a specific image or video.\nThe research community has achieved impressive results by continuously\nproposing new models and approaches to improve the overall model's performance.\nNevertheless, despite increasing proposals, the performance metrics used to\nmeasure their advances have remained practically untouched through the years. A\nprobe of that, nowadays metrics like BLEU, METEOR, CIDEr, and ROUGE are still\nvery used, aside from more sophisticated metrics such as BertScore and\nClipScore.\n Hence, it is essential to adjust how are measure the advances, limitations,\nand scopes of the new image captioning proposals, as well as to adapt new\nmetrics to these new advanced image captioning approaches.\n This work proposes a new evaluation metric for the image captioning problem.\nTo do that, first, it was generated a human-labeled dataset to assess to which\ndegree the captions correlate with the image's content. Taking these human\nscores as ground truth, we propose a new metric, and compare it with several\nwell-known metrics, from classical to newer ones. Outperformed results were\nalso found, and interesting insights were presented and discussed.\n","authors":["Guillermo Ruiz","Tania Ramírez","Daniela Moctezuma"],"pdf_url":"https://arxiv.org/pdf/2501.09155v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2302.13336v2","updated":"2025-01-15T20:50:17Z","published":"2023-02-26T15:45:19Z","title":"Key-Exchange Convolutional Auto-Encoder for Data Augmentation in Early\n Knee Osteoarthritis Detection","summary":" Knee Osteoarthritis (KOA) is a common musculoskeletal condition that\nsignificantly affects mobility and quality of life, particularly in elderly\npopulations. However, training deep learning models for early KOA\nclassification is often hampered by the limited availability of annotated\nmedical datasets, owing to the high costs and labour-intensive nature of data\nlabelling. Traditional data augmentation techniques, while useful, rely on\nsimple transformations and fail to introduce sufficient diversity into the\ndataset. To address these challenges, we propose the Key-Exchange Convolutional\nAuto-Encoder (KECAE) as an innovative Artificial Intelligence (AI)-based data\naugmentation strategy for early KOA classification. 
Our model employs a\nconvolutional autoencoder with a novel key-exchange mechanism that generates\nsynthetic images by selectively exchanging key pathological features between\nX-ray images, which not only diversifies the dataset but also ensures the\nclinical validity of the augmented data. A hybrid loss function is introduced\nto supervise feature learning and reconstruction, integrating multiple\ncomponents, including reconstruction, supervision, and feature separation\nlosses. Experimental results demonstrate that the KECAE-generated data\nsignificantly improve the performance of KOA classification models, with\naccuracy gains of up to 1.98% across various standard and state-of-the-art\narchitectures. Furthermore, a clinical validation study involving expert\nradiologists confirms the anatomical plausibility and diagnostic realism of the\nsynthetic outputs. These findings highlight the potential of KECAE as a robust\ntool for augmenting medical datasets in early KOA detection.\n","authors":["Zhe Wang","Aladine Chetouani","Mohamed Jarraya","Yung Hsin Chen","Yuhua Ru","Fang Chen","Fabian Bauer","Liping Zhang","Didier Hans","Rachid Jennane"],"pdf_url":"https://arxiv.org/pdf/2302.13336v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12141v2","updated":"2025-01-15T20:44:23Z","published":"2022-12-23T04:31:20Z","title":"Human Activity Recognition in an Open World","summary":" Managing novelty in perception-based human activity recognition (HAR) is\ncritical in realistic settings to improve task performance over time and ensure\nsolution generalization outside of prior seen samples. Novelty manifests in HAR\nas unseen samples, activities, objects, environments, and sensor changes, among\nother ways. Novelty may be task-relevant, such as a new class or new features,\nor task-irrelevant resulting in nuisance novelty, such as never before seen\nnoise, blur, or distorted video recordings. To perform HAR optimally,\nalgorithmic solutions must be tolerant to nuisance novelty, and learn over time\nin the face of novelty. This paper 1) formalizes the definition of novelty in\nHAR building upon the prior definition of novelty in classification tasks, 2)\nproposes an incremental open world learning (OWL) protocol and applies it to\nthe Kinetics datasets to generate a new benchmark KOWL-718, 3) analyzes the\nperformance of current state-of-the-art HAR models when novelty is introduced\nover time, 4) provides a containerized and packaged pipeline for reproducing\nthe OWL protocol and for modifying for any future updates to Kinetics. The\nexperimental analysis includes an ablation study of how the different models\nperform under various conditions as annotated by Kinetics-AVA. The protocol as\nan algorithm for reproducing experiments using the KOWL-718 benchmark will be\npublicly released with code and containers at\nhttps://github.com/prijatelj/human-activity-recognition-in-an-open-world. The\ncode may be used to analyze different annotations and subsets of the Kinetics\ndatasets in an incremental open world fashion, as well as be extended as\nfurther updates to Kinetics are released.\n","authors":["Derek S. Prijatelj","Samuel Grieggs","Jin Huang","Dawei Du","Ameya Shringi","Christopher Funk","Adam Kaufman","Eric Robertson","Walter J. Scheirer"],"pdf_url":"https://arxiv.org/pdf/2212.12141v2.pdf","comment":"37 pages, 16 figures, 3 tables. 
Published in JAIR 81 on Dec 20, 2024.\n All author affiliations are from during the paper's original funded work.\n Updated info and current emails are provided in this version's first page"},{"id":"http://arxiv.org/abs/2501.09138v1","updated":"2025-01-15T20:44:21Z","published":"2025-01-15T20:44:21Z","title":"Few-Shot Adaptation of Training-Free Foundation Model for 3D Medical\n Image Segmentation","summary":" Vision foundation models have achieved remarkable progress across various\nimage analysis tasks. In the image segmentation task, foundation models like\nthe Segment Anything Model (SAM) enable generalizable zero-shot segmentation\nthrough user-provided prompts. However, SAM primarily trained on natural\nimages, lacks the domain-specific expertise of medical imaging. This limitation\nposes challenges when applying SAM to medical image segmentation, including the\nneed for extensive fine-tuning on specialized medical datasets and a dependency\non manual prompts, which are both labor-intensive and require intervention from\nmedical experts.\n This work introduces the Few-shot Adaptation of Training-frEe SAM (FATE-SAM),\na novel method designed to adapt the advanced Segment Anything Model 2 (SAM2)\nfor 3D medical image segmentation. FATE-SAM reassembles pre-trained modules of\nSAM2 to enable few-shot adaptation, leveraging a small number of support\nexamples to capture anatomical knowledge and perform prompt-free segmentation,\nwithout requiring model fine-tuning. To handle the volumetric nature of medical\nimages, we incorporate a Volumetric Consistency mechanism that enhances spatial\ncoherence across 3D slices. We evaluate FATE-SAM on multiple medical imaging\ndatasets and compare it with supervised learning methods, zero-shot SAM\napproaches, and fine-tuned medical SAM methods. Results show that FATE-SAM\ndelivers robust and accurate segmentation while eliminating the need for large\nannotated datasets and expert intervention. FATE-SAM provides a practical,\nefficient solution for medical image segmentation, making it more accessible\nfor clinical applications.\n","authors":["Xingxin He","Yifan Hu","Zhaoye Zhou","Mohamed Jarraya","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.09138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13203v2","updated":"2025-01-15T20:41:42Z","published":"2023-03-23T11:57:50Z","title":"Confidence-Driven Deep Learning Framework for Early Detection of Knee\n Osteoarthritis","summary":" Knee Osteoarthritis (KOA) is a prevalent musculoskeletal disorder that\nseverely impacts mobility and quality of life, particularly among older adults.\nIts diagnosis often relies on subjective assessments using the\nKellgren-Lawrence (KL) grading system, leading to variability in clinical\nevaluations. To address these challenges, we propose a confidence-driven deep\nlearning framework for early KOA detection, focusing on distinguishing KL-0 and\nKL-2 stages. The Siamese-based framework integrates a novel multi-level feature\nextraction architecture with a hybrid loss strategy. Specifically, multi-level\nGlobal Average Pooling (GAP) layers are employed to extract features from\nvarying network depths, ensuring comprehensive feature representation, while\nthe hybrid loss strategy partitions training samples into high-, medium-, and\nlow-confidence subsets. Tailored loss functions are applied to improve model\nrobustness and effectively handle uncertainty in annotations. 
Experimental\nresults on the Osteoarthritis Initiative (OAI) dataset demonstrate that the\nproposed framework achieves competitive accuracy, sensitivity, and specificity,\ncomparable to those of expert radiologists. Cohen's kappa values (k > 0.85))\nconfirm substantial agreement, while McNemar's test (p > 0.05) indicates no\nstatistically significant differences between the model and radiologists.\nAdditionally, Confidence distribution analysis reveals that the model emulates\nradiologists' decision-making patterns. These findings highlight the potential\nof the proposed approach to serve as an auxiliary diagnostic tool, enhancing\nearly KOA detection and reducing clinical workload.\n","authors":["Zhe Wang","Aladine Chetouani","Yung Hsin Chen","Yuhua Ru","Fang Chen","Mohamed Jarraya","Fabian Bauer","Liping Zhang","Didier Hans","Rachid Jennane"],"pdf_url":"https://arxiv.org/pdf/2303.13203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09134v1","updated":"2025-01-15T20:37:04Z","published":"2025-01-15T20:37:04Z","title":"Benchmarking Robustness of Contrastive Learning Models for Medical\n Image-Report Retrieval","summary":" Medical images and reports offer invaluable insights into patient health. The\nheterogeneity and complexity of these data hinder effective analysis. To bridge\nthis gap, we investigate contrastive learning models for cross-domain\nretrieval, which associates medical images with their corresponding clinical\nreports. This study benchmarks the robustness of four state-of-the-art\ncontrastive learning models: CLIP, CXR-RePaiR, MedCLIP, and CXR-CLIP. We\nintroduce an occlusion retrieval task to evaluate model performance under\nvarying levels of image corruption. Our findings reveal that all evaluated\nmodels are highly sensitive to out-of-distribution data, as evidenced by the\nproportional decrease in performance with increasing occlusion levels. While\nMedCLIP exhibits slightly more robustness, its overall performance remains\nsignificantly behind CXR-CLIP and CXR-RePaiR. CLIP, trained on a\ngeneral-purpose dataset, struggles with medical image-report retrieval,\nhighlighting the importance of domain-specific training data. The evaluation of\nthis work suggests that more effort needs to be spent on improving the\nrobustness of these models. By addressing these limitations, we can develop\nmore reliable cross-domain retrieval models for medical applications.\n","authors":["Demetrio Deanda","Yuktha Priya Masupalli","Jeong Yang","Young Lee","Zechun Cao","Gongbo Liang"],"pdf_url":"https://arxiv.org/pdf/2501.09134v1.pdf","comment":"This work is accepted to AAAI 2025 Workshop -- the 9th International\n Workshop on Health Intelligence"},{"id":"http://arxiv.org/abs/2501.09129v1","updated":"2025-01-15T20:24:18Z","published":"2025-01-15T20:24:18Z","title":"Deep Self-Supervised Disturbance Mapping with the OPERA Sentinel-1\n Radiometric Terrain Corrected SAR Backscatter Product","summary":" Mapping land surface disturbances supports disaster response, resource and\necosystem management, and climate adaptation efforts. Synthetic aperture radar\n(SAR) is an invaluable tool for disturbance mapping, providing consistent\ntime-series images of the ground regardless of weather or illumination\nconditions. Despite SAR's potential for disturbance mapping, processing SAR\ndata to an analysis-ready format requires expertise and significant compute\nresources, particularly for large-scale global analysis. 
In October 2023,\nNASA's Observational Products for End-Users from Remote Sensing Analysis\n(OPERA) project released the near-global Radiometric Terrain Corrected SAR\nbackscatter from Sentinel-1 (RTC-S1) dataset, providing publicly available,\nanalysis-ready SAR imagery. In this work, we utilize this new dataset to\nsystematically analyze land surface disturbances. As labeling SAR data is often\nprohibitively time-consuming, we train a self-supervised vision transformer -\nwhich requires no labels to train - on OPERA RTC-S1 data to estimate a\nper-pixel distribution from the set of baseline imagery and assess disturbances\nwhen there is significant deviation from the modeled distribution. To test our\nmodel's capability and generality, we evaluate three different natural\ndisasters - which represent high-intensity, abrupt disturbances - from three\ndifferent regions of the world. Across events, our approach yields high quality\ndelineations: F1 scores exceeding 0.6 and Areas Under the Precision-Recall\nCurve exceeding 0.65, consistently outperforming existing SAR disturbance\nmethods. Our findings suggest that a self-supervised vision transformer is\nwell-suited for global disturbance mapping and can be a valuable tool for\noperational, near-global disturbance monitoring, particularly when labeled data\ndoes not exist.\n","authors":["Harris Hardiman-Mostow","Charles Marshak","Alexander L. Handwerger"],"pdf_url":"https://arxiv.org/pdf/2501.09129v1.pdf","comment":"19 pages, 18 figures, 5 tables. Preprint. Submitted to JSTARS"},{"id":"http://arxiv.org/abs/2501.09116v1","updated":"2025-01-15T19:52:02Z","published":"2025-01-15T19:52:02Z","title":"Deep Distance Map Regression Network with Shape-aware Loss for\n Imbalanced Medical Image Segmentation","summary":" Small object segmentation, like tumor segmentation, is a difficult and\ncritical task in the field of medical image analysis. Although deep learning\nbased methods have achieved promising performance, they are restricted to the\nuse of binary segmentation mask. Inspired by the rigorous mapping between\nbinary segmentation mask and distance map, we adopt distance map as a novel\nground truth and employ a network to fulfill the computation of distance map.\nSpecially, we propose a new segmentation framework that incorporates the\nexisting binary segmentation network and a light weight regression network\n(dubbed as LR-Net). Thus, the LR-Net can convert the distance map computation\ninto a regression task and leverage the rich information of distance maps.\nAdditionally, we derive a shape-aware loss by employing distance maps as\npenalty map to infer the complete shape of an object. We evaluated our approach\non MICCAI 2017 Liver Tumor Segmentation (LiTS) Challenge dataset and a clinical\ndataset. Experimental results show that our approach outperforms the\nclassification-based methods as well as other existing state-of-the-arts.\n","authors":["Huiyu Li","Xiabi Liu","Said Boumaraf","Xiaopeng Gong","Donghai Liao","Xiaohong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.09116v1.pdf","comment":"Conference"},{"id":"http://arxiv.org/abs/2501.09114v1","updated":"2025-01-15T19:50:56Z","published":"2025-01-15T19:50:56Z","title":"Generative Medical Image Anonymization Based on Latent Code Projection\n and Optimization","summary":" Medical image anonymization aims to protect patient privacy by removing\nidentifying information, while preserving the data utility to solve downstream\ntasks. 
In this paper, we address the medical image anonymization problem with a\ntwo-stage solution: latent code projection and optimization. In the projection\nstage, we design a streamlined encoder to project input images into a latent\nspace and propose a co-training scheme to enhance the projection process. In\nthe optimization stage, we refine the latent code using two deep loss functions\ndesigned to address the trade-off between identity protection and data utility\ndedicated to medical images. Through a comprehensive set of qualitative and\nquantitative experiments, we showcase the effectiveness of our approach on the\nMIMIC-CXR chest X-ray dataset by generating anonymized synthetic images that\ncan serve as training set for detecting lung pathologies. Source codes are\navailable at https://github.com/Huiyu-Li/GMIA.\n","authors":["Huiyu Li","Nicholas Ayache","Hervé Delingette"],"pdf_url":"https://arxiv.org/pdf/2501.09114v1.pdf","comment":"Conference"},{"id":"http://arxiv.org/abs/2501.09101v1","updated":"2025-01-15T19:37:18Z","published":"2025-01-15T19:37:18Z","title":"Relation U-Net","summary":" Towards clinical interpretations, this paper presents a new\n''output-with-confidence'' segmentation neural network with multiple input\nimages and multiple output segmentation maps and their pairwise relations. A\nconfidence score of the test image without ground-truth can be estimated from\nthe difference among the estimated relation maps. We evaluate the method based\non the widely used vanilla U-Net for segmentation and our new model is named\nRelation U-Net which can output segmentation maps of the input images as well\nas an estimated confidence score of the test image without ground-truth.\nExperimental results on four public datasets show that Relation U-Net can not\nonly provide better accuracy than vanilla U-Net but also estimate a confidence\nscore which is linearly correlated to the segmentation accuracy on test images.\n","authors":["Sheng He","Rina Bao","P. Ellen Grant","Yangming Ou"],"pdf_url":"https://arxiv.org/pdf/2501.09101v1.pdf","comment":"ISIB 2025"},{"id":"http://arxiv.org/abs/2501.09096v1","updated":"2025-01-15T19:29:31Z","published":"2025-01-15T19:29:31Z","title":"Self Pre-training with Adaptive Mask Autoencoders for Variable-Contrast\n 3D Medical Imaging","summary":" The Masked Autoencoder (MAE) has recently demonstrated effectiveness in\npre-training Vision Transformers (ViT) for analyzing natural images. By\nreconstructing complete images from partially masked inputs, the ViT encoder\ngathers contextual information to predict the missing regions. This capability\nto aggregate context is especially important in medical imaging, where\nanatomical structures are functionally and mechanically linked to surrounding\nregions. However, current methods do not consider variations in the number of\ninput images, which is typically the case in real-world Magnetic Resonance (MR)\nstudies. To address this limitation, we propose a 3D Adaptive Masked\nAutoencoders (AMAE) architecture that accommodates a variable number of 3D\ninput contrasts per subject. A magnetic resonance imaging (MRI) dataset of\n45,364 subjects was used for pretraining and a subset of 1648 training, 193\nvalidation and 215 test subjects were used for finetuning. 
The performance\ndemonstrates that self pre-training of this adaptive masked autoencoders can\nenhance the infarct segmentation performance by 2.8%-3.7% for ViT-based\nsegmentation models.\n","authors":["Badhan Kumar Das","Gengyan Zhao","Han Liu","Thomas J. Re","Dorin Comaniciu","Eli Gibson","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2501.09096v1.pdf","comment":"5 pages, ISBI 2025 accepted"},{"id":"http://arxiv.org/abs/2311.12068v4","updated":"2025-01-15T19:28:27Z","published":"2023-11-19T17:28:28Z","title":"Enhancing Novel Object Detection via Cooperative Foundational Models","summary":" In this work, we address the challenging and emergent problem of novel object\ndetection (NOD), focusing on the accurate detection of both known and novel\nobject categories during inference. Traditional object detection algorithms are\ninherently closed-set, limiting their capability to handle NOD. We present a\nnovel approach to transform existing closed-set detectors into open-set\ndetectors. This transformation is achieved by leveraging the complementary\nstrengths of pre-trained foundational models, specifically CLIP and SAM,\nthrough our cooperative mechanism. Furthermore, by integrating this mechanism\nwith state-of-the-art open-set detectors such as GDINO, we establish new\nbenchmarks in object detection performance. Our method achieves 17.42 mAP in\nnovel object detection and 42.08 mAP for known objects on the challenging LVIS\ndataset. Adapting our approach to the COCO OVD split, we surpass the current\nstate-of-the-art by a margin of 7.2 $ \\text{AP}_{50} $ for novel classes. Our\ncode is available at https://rohit901.github.io/coop-foundation-models/ .\n","authors":["Rohit Bharadwaj","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2311.12068v4.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2501.09086v1","updated":"2025-01-15T19:12:59Z","published":"2025-01-15T19:12:59Z","title":"Salient Information Preserving Adversarial Training Improves Clean and\n Robust Accuracy","summary":" In this work we introduce Salient Information Preserving Adversarial Training\n(SIP-AT), an intuitive method for relieving the robustness-accuracy trade-off\nincurred by traditional adversarial training. SIP-AT uses salient image regions\nto guide the adversarial training process in such a way that fragile features\ndeemed meaningful by an annotator remain unperturbed during training, allowing\nmodels to learn highly predictive non-robust features without sacrificing\noverall robustness. This technique is compatible with both human-based and\nautomatically generated salience estimates, allowing SIP-AT to be used as a\npart of human-driven model development without forcing SIP-AT to be reliant\nupon additional human data. We perform experiments across multiple datasets and\narchitectures and demonstrate that SIP-AT is able to boost the clean accuracy\nof models while maintaining a high degree of robustness against attacks at\nmultiple epsilon levels. We complement our central experiments with an\nobservational study measuring the rate at which human subjects successfully\nidentify perturbed images. This study helps build a more intuitive\nunderstanding of adversarial attack strength and demonstrates the heightened\nimportance of low-epsilon robustness. 
Our results demonstrate the efficacy of\nSIP-AT and provide valuable insight into the risks posed by adversarial samples\nof various strengths.\n","authors":["Timothy Redgrave","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2501.09086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.19794v3","updated":"2025-01-15T19:04:48Z","published":"2024-12-27T18:47:05Z","title":"MVTamperBench: Evaluating Robustness of Vision-Language Models","summary":" Recent advancements in Vision-Language Models (VLMs) have enabled significant\nprogress in complex video understanding tasks. However, their robustness to\nreal-world manipulations remains underexplored, limiting their reliability in\ncritical applications. To address this gap, we introduce MVTamperBench, a\ncomprehensive benchmark designed to evaluate VLM's resilience to video\ntampering effects, including rotation, dropping, masking, substitution, and\nrepetition. By systematically assessing state-of-the-art models, MVTamperBench\nreveals substantial variability in robustness, with models like InternVL2-8B\nachieving high performance, while others, such as Llama-VILA1.5-8B, exhibit\nsevere vulnerabilities. To foster broader adoption and reproducibility,\nMVTamperBench is integrated into VLMEvalKit, a modular evaluation toolkit,\nenabling streamlined testing and facilitating advancements in model robustness.\nOur benchmark represents a critical step towards developing tamper-resilient\nVLMs, ensuring their dependability in real-world scenarios.\n Project Page: https://amitbcp.github.io/MVTamperBench/\n","authors":["Amit Agarwal","Srikant Panda","Angeline Charles","Bhargava Kumar","Hitesh Patel","Priyaranjan Pattnayak","Taki Hasan Rafi","Tejaswini Kumar","Dong-Kyu Chae"],"pdf_url":"https://arxiv.org/pdf/2412.19794v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09055v1","updated":"2025-01-15T18:39:03Z","published":"2025-01-15T18:39:03Z","title":"SHYI: Action Support for Contrastive Learning in High-Fidelity\n Text-to-Image Generation","summary":" In this project, we address the issue of infidelity in text-to-image\ngeneration, particularly for actions involving multiple objects. For this we\nbuild on top of the CONFORM framework which uses Contrastive Learning to\nimprove the accuracy of the generated image for multiple objects. However the\ndepiction of actions which involves multiple different object has still large\nroom for improvement. To improve, we employ semantically hypergraphic\ncontrastive adjacency learning, a comprehension of enhanced contrastive\nstructure and \"contrast but link\" technique. We further amend Stable\nDiffusion's understanding of actions by InteractDiffusion. As evaluation\nmetrics we use image-text similarity CLIP and TIFA. In addition, we conducted a\nuser study.\n Our method shows promising results even with verbs that Stable Diffusion\nunderstands mediocrely. 
We then provide future directions by analyzing the\nresults.\n Our codebase can be found on polybox under the link:\nhttps://polybox.ethz.ch/index.php/s/dJm3SWyRohUrFxn\n","authors":["Tianxiang Xia","Lin Xiao","Yannick Montorfani","Francesco Pavia","Enis Simsar","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2501.09055v1.pdf","comment":"Main content 4 pages"}]},"2025-01-16T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.09757v1","updated":"2025-01-16T18:59:53Z","published":"2025-01-16T18:59:53Z","title":"Distilling Multi-modal Large Language Models for Autonomous Driving","summary":" Autonomous driving demands safe motion planning, especially in critical\n\"long-tail\" scenarios. Recent end-to-end autonomous driving systems leverage\nlarge language models (LLMs) as planners to improve generalizability to rare\nevents. However, using LLMs at test time introduces high computational costs.\nTo address this, we propose DiMA, an end-to-end autonomous driving system that\nmaintains the efficiency of an LLM-free (or vision-based) planner while\nleveraging the world knowledge of an LLM. DiMA distills the information from a\nmulti-modal LLM to a vision-based end-to-end planner through a set of specially\ndesigned surrogate tasks. Under a joint training strategy, a scene encoder\ncommon to both networks produces structured representations that are\nsemantically grounded as well as aligned to the final planning objective.\nNotably, the LLM is optional at inference, enabling robust planning without\ncompromising on efficiency. Training with DiMA results in a 37% reduction in\nthe L2 trajectory error and an 80% reduction in the collision rate of the\nvision-based planner, as well as a 44% trajectory error reduction in longtail\nscenarios. DiMA also achieves state-of-the-art performance on the nuScenes\nplanning benchmark.\n","authors":["Deepti Hegde","Rajeev Yasarla","Hong Cai","Shizhong Han","Apratim Bhattacharyya","Shweta Mahajan","Litian Liu","Risheek Garrepalli","Vishal M. Patel","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2501.09757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09747v1","updated":"2025-01-16T18:57:04Z","published":"2025-01-16T18:57:04Z","title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","summary":" Autoregressive sequence models, such as Transformer-based vision-language\naction (VLA) policies, can be tremendously effective for capturing complex and\ngeneralizable robotic behaviors. However, such models require us to choose a\ntokenization of our continuous action signals, which determines how the\ndiscrete symbols predicted by the model map to continuous robot actions. We\nfind that current approaches for robot action tokenization, based on simple\nper-dimension, per-timestep binning schemes, typically perform poorly when\nlearning dexterous skills from high-frequency robot data. To address this\nchallenge, we propose a new compression-based tokenization scheme for robot\nactions, based on the discrete cosine transform. Our tokenization approach,\nFrequency-space Action Sequence Tokenization (FAST), enables us to train\nautoregressive VLAs for highly dexterous and high-frequency tasks where\nstandard discretization methods fail completely. Based on FAST, we release\nFAST+, a universal robot action tokenizer, trained on 1M real robot action\ntrajectories. It can be used as a black-box tokenizer for a wide range of robot\naction sequences, with diverse action spaces and control frequencies. 
Finally,\nwe show that, when combined with the pi0 VLA, our method can scale to training\non 10k hours of robot data and match the performance of diffusion VLAs, while\nreducing training time by up to 5x.\n","authors":["Karl Pertsch","Kyle Stachowicz","Brian Ichter","Danny Driess","Suraj Nair","Quan Vuong","Oier Mees","Chelsea Finn","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2501.09747v1.pdf","comment":"Website: https://www.pi.website/research/fast"},{"id":"http://arxiv.org/abs/2501.09718v1","updated":"2025-01-16T18:06:09Z","published":"2025-01-16T18:06:09Z","title":"FLOL: Fast Baselines for Real-World Low-Light Enhancement","summary":" Low-Light Image Enhancement (LLIE) is a key task in computational photography\nand imaging. The problem of enhancing images captured during night or in dark\nenvironments has been well-studied in the image signal processing literature.\nHowever, current deep learning-based solutions struggle with efficiency and\nrobustness in real-world scenarios (e.g. scenes with noise, saturated pixels,\nbad illumination). We propose a lightweight neural network that combines image\nprocessing in the frequency and spatial domains. Our method, FLOL+, is one of\nthe fastest models for this task, achieving state-of-the-art results on popular\nreal scenes datasets such as LOL and LSRW. Moreover, we are able to process\n1080p images under 12ms. Code and models at https://github.com/cidautai/FLOL\n","authors":["Juan C. Benito","Daniel Feijoo","Alvaro Garcia","Marcos V. Conde"],"pdf_url":"https://arxiv.org/pdf/2501.09718v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2501.09680v1","updated":"2025-01-16T17:31:27Z","published":"2025-01-16T17:31:27Z","title":"CoNav Chair: Design of a ROS-based Smart Wheelchair for Shared Control\n Navigation in the Built Environment","summary":" With the number of people with disabilities (PWD) increasing worldwide each\nyear, the demand for mobility support to enable independent living and social\nintegration is also growing. Wheelchairs commonly support the mobility of PWD\nin both indoor and outdoor environments. However, current powered wheelchairs\n(PWC) often fail to meet the needs of PWD, who may find it difficult to operate\nthem. Furthermore, existing research on robotic wheelchairs typically focuses\neither on full autonomy or enhanced manual control, which can lead to reduced\nefficiency and user trust. To address these issues, this paper proposes a Robot\nOperating System (ROS)-based smart wheelchair, called CoNav Chair, that\nincorporates a shared control navigation algorithm and obstacle avoidance to\nsupport PWD while fostering efficiency and trust between the robot and the\nuser. Our design consists of hardware and software components. Experimental\nresults conducted in a typical indoor social environment demonstrate the\nperformance and effectiveness of the smart wheelchair hardware and software\ndesign. 
This integrated design promotes trust and autonomy, which are crucial\nfor the acceptance of assistive mobility technologies in the built environment.\n","authors":["Yifan Xu","Qianwei Wang","Jordan Lillie","Vineet Kamat","Carol Menassa"],"pdf_url":"https://arxiv.org/pdf/2501.09680v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2501.09668v1","updated":"2025-01-16T17:05:54Z","published":"2025-01-16T17:05:54Z","title":"Model Predictive Path Integral Docking of Fully Actuated Surface Vessel","summary":" Autonomous docking remains one of the most challenging maneuvers in marine\nrobotics, requiring precise control and robust perception in confined spaces.\nThis paper presents a novel approach integrating Model Predictive Path\nIntegral(MPPI) control with real-time LiDAR-based dock detection for autonomous\nsurface vessel docking. Our framework uniquely combines probabilistic\ntrajectory optimization with a multiobjective cost function that simultaneously\nconsiders docking precision, safety constraints, and motion efficiency. The\nMPPI controller generates optimal trajectories by intelligently sampling\ncontrol sequences and evaluating their costs based on dynamic clearance\nrequirements, orientation alignment, and target position objectives. We\nintroduce an adaptive dock detection pipeline that processes LiDAR point clouds\nto extract critical geometric features, enabling real-time updates of docking\nparameters. The proposed method is extensively validated in a physics-based\nsimulation environment that incorporates realistic sensor noise, vessel\ndynamics, and environmental constraints. Results demonstrate successful docking\nfrom various initial positions while maintaining safe clearances and smooth\nmotion characteristics.\n","authors":["Akash Vijayakumar","Atmanand M A","Abhilash Somayajula"],"pdf_url":"https://arxiv.org/pdf/2501.09668v1.pdf","comment":"6 pages, 6 figures, 1 table, UT2025 Conference, IEEE International\n Symposium on Underwater Technology 2025"},{"id":"http://arxiv.org/abs/2412.12406v3","updated":"2025-01-16T16:55:40Z","published":"2024-12-16T23:17:40Z","title":"Global SLAM in Visual-Inertial Systems with 5G Time-of-Arrival\n Integration","summary":" This paper presents a novel approach that integrates 5G Time of Arrival (ToA)\nmeasurements into ORB-SLAM3 to enable global localization and enhance mapping\ncapabilities for indoor drone navigation. We extend ORB-SLAM3's optimization\npipeline to jointly process ToA data from 5G base stations alongside visual and\ninertial measurements while estimating system biases. This integration\ntransforms the inherently local SLAM estimates into globally referenced\ntrajectories and effectively resolves scale ambiguity in monocular\nconfigurations. Our method is evaluated using five real-world indoor datasets\ncollected with RGB-D cameras and inertial measurement units (IMUs),\ncomplemented by simulated 5G ToA measurements at 28 GHz and 78 GHz frequencies\nusing MATLAB and QuaDRiGa. Extensive experiments across four SLAM\nconfigurations (RGB-D, RGB-D-Inertial, Monocular, and Monocular-Inertial)\ndemonstrate that ToA integration enables consistent global positioning across\nall modes while significantly improving local accuracy in minimal sensor\nsetups. Notably, ToA-enhanced monocular SLAM achieves superior local accuracy\n(6.3 cm average) compared to the RGB-D baseline (11.5 cm), and enables reliable\noperation of monocular-inertial SLAM in scenarios where the baseline system\nfails completely. 
While ToA integration offers limited local accuracy\nimprovements for sensor-rich configurations like RGB-D SLAM, it consistently\nenables robust global localization.\n","authors":["Meisam Kabiri","Holger Voos"],"pdf_url":"https://arxiv.org/pdf/2412.12406v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09649v1","updated":"2025-01-16T16:45:08Z","published":"2025-01-16T16:45:08Z","title":"Monte Carlo Tree Search with Velocity Obstacles for safe and efficient\n motion planning in dynamic environments","summary":" Online motion planning is a challenging problem for intelligent robots moving\nin dense environments with dynamic obstacles, e.g., crowds. In this work, we\npropose a novel approach for optimal and safe online motion planning with\nminimal information about dynamic obstacles. Specifically, our approach\nrequires only the current position of the obstacles and their maximum speed,\nbut it does not need any information about their exact trajectories or dynamic\nmodel. The proposed methodology combines Monte Carlo Tree Search (MCTS), for\nonline optimal planning via model simulations, with Velocity Obstacles (VO),\nfor obstacle avoidance. We perform experiments in a cluttered simulated\nenvironment with walls, and up to 40 dynamic obstacles moving with random\nvelocities and directions. With an ablation study, we show the key contribution\nof VO in scaling up the efficiency of MCTS, selecting the safest and most\nrewarding actions in the tree of simulations. Moreover, we show the superiority\nof our methodology with respect to state-of-the-art planners, including\nNon-linear Model Predictive Control (NMPC), in terms of improved collision\nrate, computational and task performance.\n","authors":["Lorenzo Bonanni","Daniele Meli","Alberto Castellini","Alessandro Farinelli"],"pdf_url":"https://arxiv.org/pdf/2501.09649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12105v3","updated":"2025-01-16T15:23:19Z","published":"2024-07-16T18:23:10Z","title":"AeroHaptix: A Wearable Vibrotactile Feedback System for Enhancing\n Collision Avoidance in UAV Teleoperation","summary":" Haptic feedback enhances collision avoidance by providing directional\nobstacle information to operators during unmanned aerial vehicle (UAV)\nteleoperation. However, such feedback is often rendered via haptic joysticks,\nwhich are unfamiliar to UAV operators and limited to single-direction force\nfeedback. Additionally, the direct coupling between the input device and the\nfeedback method diminishes operators' sense of control and induces oscillatory\nmovements. To overcome these limitations, we propose AeroHaptix, a wearable\nhaptic feedback system that uses spatial vibrations to simultaneously\ncommunicate multiple obstacle directions to operators, without interfering with\ntheir input control. The layout of vibrotactile actuators was optimized via a\nperceptual study to eliminate perceptual biases and achieve uniform spatial\ncoverage. A novel rendering algorithm, MultiCBF, extended control barrier\nfunctions to support multi-directional feedback. Our system evaluation showed\nthat compared to a no-feedback condition, AeroHaptix effectively reduced the\nnumber of collisions and input disagreement. 
Furthermore, operators reported\nthat AeroHaptix was more helpful than force feedback, with improved situational\nawareness and comparable workload.\n","authors":["Bingjian Huang","Zhecheng Wang","Qilong Cheng","Siyi Ren","Hanfeng Cai","Antonio Alvarez Valdivia","Karthik Mahadevan","Daniel Wigdor"],"pdf_url":"https://arxiv.org/pdf/2407.12105v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09600v1","updated":"2025-01-16T15:22:06Z","published":"2025-01-16T15:22:06Z","title":"Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid\n Prototyping in Virtual Reality Applications","summary":" SLAM is a foundational technique with broad applications in robotics and\nAR/VR. SLAM simulations evaluate new concepts, but testing on\nresource-constrained devices, such as VR HMDs, faces challenges: high\ncomputational cost and restricted sensor data access. This work proposes a\nsparse framework using mesh geometry projections as features, which improves\nefficiency and circumvents direct sensor data access, advancing SLAM research\nas we demonstrate in VR and through numerical evaluation.\n","authors":["Carlos Augusto Pinheiro de Sousa","Heiko Hamann","Oliver Deussen"],"pdf_url":"https://arxiv.org/pdf/2501.09600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09490v1","updated":"2025-01-16T12:01:44Z","published":"2025-01-16T12:01:44Z","title":"Comparison of Various SLAM Systems for Mobile Robot in an Indoor\n Environment","summary":" This article presents a comparative analysis of a mobile robot trajectories\ncomputed by various ROS-based SLAM systems. For this reason we developed a\nprototype of a mobile robot with common sensors: 2D lidar, a monocular and ZED\nstereo cameras. Then we conducted experiments in a typical office environment\nand collected data from all sensors, running all tested SLAM systems based on\nthe acquired dataset. We studied the following SLAM systems: (a) 2D\nlidar-based: GMapping, Hector SLAM, Cartographer; (b) monocular camera-based:\nLarge Scale Direct monocular SLAM (LSD SLAM), ORB SLAM, Direct Sparse Odometry\n(DSO); and (c) stereo camera-based: ZEDfu, Real-Time Appearance-Based Mapping\n(RTAB map), ORB SLAM, Stereo Parallel Tracking and Mapping (S-PTAM). Since all\nSLAM methods were tested on the same dataset we compared results for different\nSLAM systems with appropriate metrics, demonstrating encouraging results for\nlidar-based Cartographer SLAM, Monocular ORB SLAM and Stereo RTAB Map methods.\n","authors":["Maksim Filipenko","Ilya Afanasyev"],"pdf_url":"https://arxiv.org/pdf/2501.09490v1.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.16485v3","updated":"2025-01-16T11:59:02Z","published":"2024-07-23T14:00:18Z","title":"Learning Constraint Network from Demonstrations via Positive-Unlabeled\n Learning with Memory Replay","summary":" Planning for a wide range of real-world tasks necessitates to know and write\nall constraints. However, instances exist where these constraints are either\nunknown or challenging to specify accurately. A possible solution is to infer\nthe unknown constraints from expert demonstration. The majority of prior works\nlimit themselves to learning simple linear constraints, or require strong\nknowledge of the true constraint parameterization or environmental model. To\nmitigate these problems, this paper presents a positive-unlabeled (PU) learning\napproach to infer a continuous, arbitrary and possibly nonlinear, constraint\nfrom demonstration. 
From a PU learning view, we treat all data in\ndemonstrations as positive (feasible) data, and learn a (sub)-optimal policy to\ngenerate high-reward-winning but potentially infeasible trajectories, which\nserve as unlabeled data containing both feasible and infeasible states. Under\nan assumption on data distribution, a feasible-infeasible classifier (i.e.,\nconstraint model) is learned from the two datasets through a postprocessing PU\nlearning technique. The entire method employs an iterative framework\nalternating between updating the policy, which generates and selects\nhigher-reward policies, and updating the constraint model. Additionally, a\nmemory buffer is introduced to record and reuse samples from previous\niterations to prevent forgetting. The effectiveness of the proposed method is\nvalidated in two Mujoco environments, successfully inferring continuous\nnonlinear constraints and outperforming a baseline method in terms of\nconstraint accuracy and policy safety.\n","authors":["Baiyu Peng","Aude Billard"],"pdf_url":"https://arxiv.org/pdf/2407.16485v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09468v1","updated":"2025-01-16T11:10:10Z","published":"2025-01-16T11:10:10Z","title":"Sensorimotor Control Strategies for Tactile Robotics","summary":" How are robots becoming smarter at interacting with their surroundings?\nRecent advances have reshaped how robots use tactile sensing to perceive and\nengage with the world. Tactile sensing is a game-changer, allowing robots to\nembed sensorimotor control strategies to interact with complex environments and\nskillfully handle heterogeneous objects. Such control frameworks plan\ncontact-driven motions while staying responsive to sudden changes. We review\nthe latest methods for building perception and control systems in tactile\nrobotics while offering practical guidelines for their design and\nimplementation. We also address key challenges to shape the future of\nintelligent robots.\n","authors":["Enrico Donato","Matteo Lo Preti","Lucia Beccai","Egidio Falotico"],"pdf_url":"https://arxiv.org/pdf/2501.09468v1.pdf","comment":"39 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.01622v2","updated":"2025-01-16T10:30:40Z","published":"2024-08-03T01:09:48Z","title":"Positive-Unlabeled Constraint Learning for Inferring Nonlinear\n Continuous Constraints Functions from Expert Demonstrations","summary":" Planning for diverse real-world robotic tasks necessitates knowing and writing\nall constraints. However, instances exist where these constraints are either\nunknown or challenging to specify accurately. A possible solution is to infer\nthe unknown constraints from expert demonstration. This paper presents a novel\ntwo-step Positive-Unlabeled Constraint Learning (PUCL) algorithm to infer a\ncontinuous constraint function from demonstrations, without requiring prior\nknowledge of the true constraint parameterization or environmental model, as\nexisting works do. We treat all data in demonstrations as positive (feasible)\ndata, and learn a control policy to generate potentially infeasible\ntrajectories, which serve as unlabeled data. The proposed two-step learning\nframework first identifies reliable infeasible data using a distance metric,\nand secondly learns a binary feasibility classifier (i.e., constraint function)\nfrom the feasible demonstrations and reliable infeasible data. 
The proposed\nmethod is flexible to learn complex-shaped constraint boundary and will not\nmistakenly classify demonstrations as infeasible as previous methods. The\neffectiveness of the proposed method is verified in four constrained\nenvironments, using a networked policy or a dynamical system policy. It\nsuccessfully infers the continuous nonlinear constraints and outperforms other\nbaseline methods in terms of constraint accuracy and policy safety. This work\nhas been published in IEEE Robotics and Automation Letters (RA-L). Please refer\nto the final version at https://doi.org/10.1109/LRA.2024.3522756\n","authors":["Baiyu Peng","Aude Billard"],"pdf_url":"https://arxiv.org/pdf/2408.01622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09450v1","updated":"2025-01-16T10:25:24Z","published":"2025-01-16T10:25:24Z","title":"Real-Time Generation of Near-Minimum-Energy Trajectories via\n Constraint-Informed Residual Learning","summary":" Industrial robotics demands significant energy to operate, making\nenergy-reduction methodologies increasingly important. Strategies for planning\nminimum-energy trajectories typically involve solving nonlinear optimal control\nproblems (OCPs), which rarely cope with real-time requirements. In this paper,\nwe propose a paradigm for generating near minimum-energy trajectories for\nmanipulators by learning from optimal solutions. Our paradigm leverages a\nresidual learning approach, which embeds boundary conditions while focusing on\nlearning only the adjustments needed to steer a standard solution to an optimal\none. Compared to a computationally expensive OCP-based planner, our paradigm\nachieves 87.3% of the performance near the training dataset and 50.8% far from\nthe dataset, while being two to three orders of magnitude faster.\n","authors":["Domenico Dona'","Giovanni Franzese","Cosimo Della Santina","Paolo Boscariol","Basilio Lenzo"],"pdf_url":"https://arxiv.org/pdf/2501.09450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.20770v2","updated":"2025-01-16T10:24:32Z","published":"2024-12-30T07:41:01Z","title":"Humanoid Robot RHP Friends: Seamless Combination of Autonomous and\n Teleoperated Tasks in a Nursing Context","summary":" This paper describes RHP Friends, a social humanoid robot developed to enable\nassistive robotic deployments in human-coexisting environments. As a use-case\napplication, we present its potential use in nursing by extending its\ncapabilities to operate human devices and tools according to the task and by\nenabling remote assistance operations. To meet a wide variety of tasks and\nsituations in environments designed by and for humans, we developed a system\nthat seamlessly integrates the slim and lightweight robot and several\ntechnologies: locomanipulation, multi-contact motion, teleoperation, and object\ndetection and tracking. We demonstrated the system's usage in a nursing\napplication. 
The robot efficiently performed the daily task of patient transfer\nand a non-routine task, represented by a request to operate a circuit breaker.\nThis demonstration, held at the 2023 International Robot Exhibition (IREX),\nwas conducted three times a day over three days.\n","authors":["Mehdi Benallegue","Guillaume Lorthioir","Antonin Dallard","Rafael Cisneros-Limón","Iori Kumagai","Mitsuharu Morisawa","Hiroshi Kaminaga","Masaki Murooka","Antoine Andre","Pierre Gergondet","Kenji Kaneko","Guillaume Caron","Fumio Kanehiro","Abderrahmane Kheddar","Soh Yukizaki","Junichi Karasuyama","Junichi Murakami","Masayuki Kamon"],"pdf_url":"https://arxiv.org/pdf/2412.20770v2.pdf","comment":"IEEE Robotics and Automation Magazine, In press"},{"id":"http://arxiv.org/abs/2411.05548v3","updated":"2025-01-16T08:22:12Z","published":"2024-11-08T13:11:16Z","title":"Equivariant IMU Preintegration with Biases: a Galilean Group Approach","summary":" This letter proposes a new approach for Inertial Measurement Unit (IMU)\npreintegration, a fundamental building block that can be leveraged in different\noptimization-based Inertial Navigation System (INS) localization solutions.\nInspired by recent advances in equivariant theory applied to biased INSs, we\nderive a discrete-time formulation of the IMU preintegration on\n${\\mathbf{Gal}(3) \\ltimes \\mathfrak{gal}(3)}$, the left-trivialization of the\ntangent group of the Galilean group $\\mathbf{Gal}(3)$. We define a novel\npreintegration error that geometrically couples the navigation states and the\nbias leading to lower linearization error. Our method improves in consistency\ncompared to existing preintegration approaches which treat IMU biases as a\nseparate state-space. Extensive validation against state-of-the-art methods,\nboth in simulation and with real-world IMU data, implementation in the Lie++\nlibrary, and open-source code are provided.\n","authors":["Giulio Delama","Alessandro Fornasier","Robert Mahony","Stephan Weiss"],"pdf_url":"https://arxiv.org/pdf/2411.05548v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09357v1","updated":"2025-01-16T08:08:42Z","published":"2025-01-16T08:08:42Z","title":"Path Planning for a UAV Swarm Using Formation Teaching-Learning-Based\n Optimization","summary":" This work addresses the path planning problem for a group of unmanned aerial\nvehicles (UAVs) to maintain a desired formation during operation. Our approach\nformulates the problem as an optimization task by defining a set of fitness\nfunctions that not only ensure the formation but also include constraints for\noptimal and safe UAV operation. To optimize the fitness function and obtain a\nsuboptimal path, we employ the teaching-learning-based optimization algorithm\nand then further enhance it with mechanisms such as mutation, elite strategy,\nand multi-subject combination. A number of simulations and experiments have\nbeen conducted to evaluate the proposed method. 
The results demonstrate that\nthe algorithm successfully generates valid paths for the UAVs to fly in a\ntriangular formation for an inspection task.\n","authors":["Van Truong Hoang","Manh Duong Phung"],"pdf_url":"https://arxiv.org/pdf/2501.09357v1.pdf","comment":"in Proceedings of the 2025 International Conference on Energy,\n Infrastructure and Environmental Research (EIER2025)"},{"id":"http://arxiv.org/abs/2501.09338v1","updated":"2025-01-16T07:38:56Z","published":"2025-01-16T07:38:56Z","title":"Robust UAV Path Planning with Obstacle Avoidance for Emergency Rescue","summary":" Unmanned aerial vehicles (UAVs) are efficient tools for diverse tasks\nsuch as electronic reconnaissance, agricultural operations and disaster relief.\nIn complex three-dimensional (3D) environments, path planning with\nobstacle avoidance for UAVs is a significant issue for security assurance. In\nthis paper, we construct a comprehensive 3D scenario with obstacles and no-fly\nzones for dynamic UAV trajectory. Moreover, a novel artificial potential field\nalgorithm coupled with simulated annealing (APF-SA) is proposed to tackle the\nrobust path planning problem. APF-SA modifies the attractive and repulsive\npotential functions and leverages simulated annealing to escape local minima\nand converge to globally optimal solutions. Simulation results demonstrate\nthe effectiveness of APF-SA, enabling efficient autonomous path planning for\nUAVs with obstacle avoidance.\n","authors":["Junteng Mao","Ziye Jia","Hanzhi Gu","Chenyu Shi","Haomin Shi","Lijun He","Qihui Wu"],"pdf_url":"https://arxiv.org/pdf/2501.09338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.07259v2","updated":"2025-01-16T07:26:52Z","published":"2025-01-13T12:14:48Z","title":"PO-GVINS: Tightly Coupled GNSS-Visual-Inertial Integration with\n Pose-Only Representation","summary":" Accurate and reliable positioning is crucial for perception, decision-making,\nand other high-level applications in autonomous driving, unmanned aerial\nvehicles, and intelligent robots. Given the inherent limitations of standalone\nsensors, integrating heterogeneous sensors with complementary capabilities is\none of the most effective approaches to achieving this goal. In this paper, we\npropose a filtering-based, tightly coupled global navigation satellite system\n(GNSS)-visual-inertial positioning framework with a pose-only formulation\napplied to the visual-inertial system (VINS), termed PO-GVINS. Specifically,\nthe multiple-view imaging used in current VINS requires a prior on 3D features and\nthen jointly estimates camera poses and 3D feature positions, which inevitably\nintroduces linearization error of the features as well as facing dimensional\nexplosion. However, the pose-only (PO) formulation, which is demonstrated to be\nequivalent to the multiple-view imaging and has been applied in visual\nreconstruction, represents feature depth using two camera poses, and thus the 3D\nfeature position is removed from the state vector, avoiding the aforementioned\ndifficulties. Inspired by this, we first apply the PO formulation in our VINS,\ni.e., PO-VINS. GNSS raw measurements are then incorporated with integer\nambiguity resolved to achieve accurate and drift-free estimation. Extensive\nexperiments demonstrate that the proposed PO-VINS significantly outperforms the\nmulti-state constrained Kalman filter (MSCKF). 
By incorporating GNSS\nmeasurements, PO-GVINS achieves accurate, drift-free state estimation, making\nit a robust solution for positioning in challenging environments.\n","authors":["Zhuo Xu","Feng Zhu","Zihang Zhang","Chang Jian","Jiarui Lv","Yuantai Zhang","Xiaohong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.07259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09307v1","updated":"2025-01-16T05:40:37Z","published":"2025-01-16T05:40:37Z","title":"RoboReflect: Robotic Reflective Reasoning for Grasping\n Ambiguous-Condition Objects","summary":" As robotic technology rapidly develops, robots are being employed in an\nincreasing number of fields. However, due to the complexity of deployment\nenvironments or the prevalence of ambiguous-condition objects, the practical\napplication of robotics still faces many challenges, leading to frequent\nerrors. Traditional methods and some LLM-based approaches, although improved,\nstill require substantial human intervention and struggle with autonomous error\ncorrection in complex scenarios. In this work, we propose RoboReflect, a novel\nframework leveraging large vision-language models (LVLMs) to enable\nself-reflection and autonomous error correction in robotic grasping tasks.\nRoboReflect allows robots to automatically adjust their strategies based on\nunsuccessful attempts until successful execution is achieved. The corrected\nstrategies are saved in a memory for future task reference. We evaluate\nRoboReflect through extensive testing on eight common objects prone to\nambiguous conditions of three categories. Our results demonstrate that\nRoboReflect not only outperforms existing grasp pose estimation methods like\nAnyGrasp and high-level action planning techniques using GPT-4V but also\nsignificantly enhances the robot's ability to adapt and correct errors\nindependently. These findings underscore the critical importance of autonomous\nself-reflection in robotic systems while effectively addressing the challenges\nposed by ambiguous environments.\n","authors":["Zhen Luo","Yixuan Yang","Chang Cai","Yanfu Zhang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.09307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09290v1","updated":"2025-01-16T04:50:15Z","published":"2025-01-16T04:50:15Z","title":"Interoceptive Robots for Convergent Shared Control in Collaborative\n Construction Work","summary":" Building autonomous mobile robots (AMRs) with optimized efficiency and\nadaptive capabilities -- able to respond to changing task demands and dynamic\nenvironments -- is a strongly desired goal for advancing construction robotics.\nSuch robots can play a critical role in enabling automation, reducing\noperational carbon footprints, and supporting modular construction processes.\nInspired by the adaptive autonomy of living organisms, we introduce\ninteroception, which centers on the robot's internal state representation, as a\nfoundation for developing self-reflection and conscious learning to enable\ncontinual learning and adaptability in robotic agents. In this paper, we\nfactorize internal state variables and mathematical properties as \"cognitive\ndissonance\" in shared control paradigms, where human interventions occasionally\noccur. We offer a new perspective on how interoception can help build adaptive\nmotion planning in AMRs by integrating the legacy of heuristic costs from\ngrid/graph-based algorithms with recent advances in neuroscience and\nreinforcement learning. 
Declarative and procedural knowledge extracted from\nhuman semantic inputs is encoded into a hypergraph model that overlaps with the\nspatial configuration of onsite layout for path planning. In addition, we\ndesign a velocity-replay module using an encoder-decoder architecture with\nfew-shot learning to enable robots to replicate velocity profiles in\ncontextualized scenarios for multi-robot synchronization and handover\ncollaboration. These \"cached\" knowledge representations are demonstrated in\nsimulated environments for multi-robot motion planning and stacking tasks. The\ninsights from this study pave the way toward artificial general intelligence in\nAMRs, fostering their progression from complexity to competence in construction\nautomation.\n","authors":["Xiaoshan Zhou","Carol C. Menassa","Vineet R. Kamat"],"pdf_url":"https://arxiv.org/pdf/2501.09290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09273v1","updated":"2025-01-16T03:44:14Z","published":"2025-01-16T03:44:14Z","title":"ThinTact: Thin Vision-Based Tactile Sensor by Lensless Imaging","summary":" Vision-based tactile sensors have drawn increasing interest in the robotics\ncommunity. However, traditional lens-based designs impose minimum thickness\nconstraints on these sensors, limiting their applicability in space-restricted\nsettings. In this paper, we propose ThinTact, a novel lensless vision-based\ntactile sensor with a sensing field of over 200 mm^2 and a thickness of less\nthan 10 mm. ThinTact utilizes the mask-based lensless imaging technique to map\nthe contact information to CMOS signals. To ensure real-time tactile sensing,\nwe propose a real-time lensless reconstruction algorithm that leverages a\nfrequency-spatial-domain joint filter based on discrete cosine transform (DCT).\nThis algorithm achieves computation significantly faster than existing\noptimization-based methods. Additionally, to improve the sensing quality, we\ndevelop a mask optimization method based on the genetic algorithm and the\ncorresponding system matrix calibration algorithm. We evaluate the performance\nof our proposed lensless reconstruction and tactile sensing through qualitative\nand quantitative experiments. Furthermore, we demonstrate ThinTact's practical\napplicability in diverse applications, including texture recognition and\ncontact-rich object manipulation. The paper will appear in the IEEE\nTransactions on Robotics: https://ieeexplore.ieee.org/document/10842357. Video:\nhttps://youtu.be/YrOO9BDMAHo\n","authors":["Jing Xu","Weihang Chen","Hongyu Qian","Dan Wu","Rui Chen"],"pdf_url":"https://arxiv.org/pdf/2501.09273v1.pdf","comment":"\\c{opyright} 2025 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2501.09267v1","updated":"2025-01-16T03:34:36Z","published":"2025-01-16T03:34:36Z","title":"Are Open-Vocabulary Models Ready for Detection of MEP Elements on\n Construction Sites","summary":" The construction industry has long explored robotics and computer vision, yet\ntheir deployment on construction sites remains very limited. 
These technologies\nhave the potential to revolutionize traditional workflows by enhancing\naccuracy, efficiency, and safety in construction management. Ground robots\nequipped with advanced vision systems could automate tasks such as monitoring\nmechanical, electrical, and plumbing (MEP) systems. The present research\nevaluates the applicability of open-vocabulary vision-language models compared\nto fine-tuned, lightweight, closed-set object detectors for detecting MEP\ncomponents using a mobile ground robotic platform. A dataset collected with\ncameras mounted on a ground robot was manually annotated and analyzed to\ncompare model performance. The results demonstrate that, despite the\nversatility of vision-language models, fine-tuned lightweight models still\nlargely outperform them in specialized environments and for domain-specific\ntasks.\n","authors":["Abdalwhab Abdalwhab","Ali Imran","Sina Heydarian","Ivanka Iordanova","David St-Onge"],"pdf_url":"https://arxiv.org/pdf/2501.09267v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.13345v2","updated":"2025-01-16T02:37:08Z","published":"2024-05-22T05:04:44Z","title":"Autonomous Algorithm for Training Autonomous Vehicles with Minimal Human\n Intervention","summary":" Recent reinforcement learning (RL) algorithms have demonstrated impressive\nresults in simulated driving environments. However, autonomous vehicles trained\nin simulation often struggle to work well in the real world due to the fidelity\ngap between simulated and real-world environments. While directly training\nreal-world autonomous vehicles with RL algorithms is a promising approach to\nbypass the fidelity gap problem, it presents several challenges. One critical\nyet often overlooked challenge is the need to reset a driving environment\nbetween every episode. This reset process demands significant human\nintervention, leading to poor training efficiency in the real world. In this\npaper, we introduce a novel autonomous algorithm that enables off-the-shelf RL\nalgorithms to train autonomous vehicles with minimal human intervention. Our\nalgorithm reduces unnecessary human intervention by aborting episodes to\nprevent unsafe states and identifying informative initial states for subsequent\nepisodes. The key idea behind identifying informative initial states is to\nestimate the expected amount of information that can be obtained from\nunder-explored but reachable states. Our algorithm also revisits rule-based\nautonomous driving algorithms and highlights their benefits in safely returning\nan autonomous vehicle to initial states. To evaluate how much human\nintervention is required during training, we implement challenging urban\ndriving tasks that require an autonomous vehicle to reset to initial states on\nits own. The experimental results show that our autonomous algorithm is\ntask-agnostic and achieves competitive driving performance with much less human\nintervention than baselines.\n","authors":["Sang-Hyun Lee","Daehyeok Kwon","Seung-Woo Seo"],"pdf_url":"https://arxiv.org/pdf/2405.13345v2.pdf","comment":"8 pages, 6 figures, 2 tables, conference"},{"id":"http://arxiv.org/abs/2405.00846v4","updated":"2025-01-16T01:49:35Z","published":"2024-05-01T20:21:44Z","title":"Gameplay Filters: Robust Zero-Shot Safety through Adversarial\n Imagination","summary":" Despite the impressive recent advances in learning-based robot control,\nensuring robustness to out-of-distribution conditions remains an open\nchallenge. 
Safety filters can, in principle, keep arbitrary control policies\nfrom incurring catastrophic failures by overriding unsafe actions, but existing\nsolutions for complex (e.g., legged) robot dynamics do not span the full motion\nenvelope and instead rely on local, reduced-order models. These filters tend to\noverly restrict agility and can still fail when perturbed away from nominal\nconditions. This paper presents the gameplay filter, a new class of predictive\nsafety filter that continually plays out hypothetical matches between its\nsimulation-trained safety strategy and a virtual adversary co-trained to invoke\nworst-case events and sim-to-real error, and precludes actions that would cause\nfailures down the line. We demonstrate the scalability and robustness of the\napproach with a first-of-its-kind full-order safety filter for (36-D)\nquadrupedal dynamics. Physical experiments on two different quadruped platforms\ndemonstrate the superior zero-shot effectiveness of the gameplay filter under\nlarge perturbations such as tugging and unmodeled terrain. Experiment videos\nand open-source software are available online:\nhttps://saferobotics.org/research/gameplay-filter\n","authors":["Duy P. Nguyen","Kai-Chieh Hsu","Wenhao Yu","Jie Tan","Jaime F. Fisac"],"pdf_url":"https://arxiv.org/pdf/2405.00846v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15922v4","updated":"2025-01-16T23:59:18Z","published":"2024-09-24T09:45:20Z","title":"The Dark Side of Rich Rewards: Understanding and Mitigating Noise in VLM\n Rewards","summary":" While Vision-Language Models (VLMs) are increasingly used to generate reward\nsignals for training embodied agents to follow instructions, our research\nreveals that agents guided by VLM rewards often underperform compared to those\nemploying only intrinsic (exploration-driven) rewards, contradicting\nexpectations set by recent work. We hypothesize that false positive rewards --\ninstances where unintended trajectories are incorrectly rewarded -- are more\ndetrimental than false negatives. Our analysis confirms this hypothesis,\nrevealing that the widely used cosine similarity metric is prone to false\npositive reward estimates. To address this, we introduce BiMI ({Bi}nary\n{M}utual {I}nformation), a novel reward function designed to mitigate noise.\nBiMI significantly enhances learning efficiency across diverse and challenging\nembodied navigation environments. Our findings offer a nuanced understanding of\nhow different types of reward noise impact agent learning and highlight the\nimportance of addressing multimodal reward signal noise when training embodied\nagents.\n","authors":["Sukai Huang","Shu-Wei Liu","Nir Lipovetzky","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2409.15922v4.pdf","comment":"11 main body pages, 21 appendix pages"},{"id":"http://arxiv.org/abs/2409.20539v3","updated":"2025-01-16T20:14:13Z","published":"2024-09-30T17:41:00Z","title":"Visual collective behaviors on spherical robots","summary":" The implementation of collective motion traditionally disregards the limited\nsensing capabilities of an individual, instead assuming an omniscient\nperception of the environment. This study implements a visual flocking model in\na ``robot-in-the-loop'' approach to reproduce these behaviors with a flock\ncomposed of 10 independent spherical robots. The model achieves robotic\ncollective motion by only using panoramic visual information of each robot,\nsuch as retinal position, optical size and optic flow of the neighboring\nrobots. 
We introduce a virtual anchor to confine the collective robotic\nmovements so as to avoid wall interactions. For the first time, a simple visual\nrobot-in-the-loop approach succeeds in reproducing several collective motion\nphases, in particular swarming and milling. Another milestone achieved\nby this model is bridging the gap between simulation and physical experiments\nby demonstrating nearly identical behaviors in both environments with the same\nvisual model. To conclude, we show that our minimal visual collective motion\nmodel is sufficient to recreate most collective behaviors on a\nrobot-in-the-loop system that is scalable, behaves as numerical simulations\npredict and is easily comparable to traditional models.\n","authors":["Diego Castro","Christophe Eloy","Franck Ruffier"],"pdf_url":"https://arxiv.org/pdf/2409.20539v3.pdf","comment":"26 pages, 16 figures, journal bioinspired and biomimetics"},{"id":"http://arxiv.org/abs/2501.09819v1","updated":"2025-01-16T20:09:40Z","published":"2025-01-16T20:09:40Z","title":"Torque Responsive Metamaterials Enable High Payload Soft Robot Arms","summary":" Soft robots have struggled to support large forces and moments while also\nsupporting their own weight against gravity. This limits their ability to reach\ncertain configurations necessary for tasks such as inspection and pushing\nobjects up. We have overcome this limitation by creating an electrically driven\nmetamaterial soft arm using handed shearing auxetics (HSA) and bendable\nextendable torque resistant (BETR) shafts. These use the large force and torque\ncapacity of HSAs and the nestable torque transmission of BETRs to create a\nstrong soft arm. We found that the HSA arm was able to push 2.3 kg vertically\nand lift more than 600 g when positioned horizontally, supporting 0.33 Nm of\ntorque at the base. The arm is able to move between waypoints while carrying\nthe large payload and demonstrates consistent movement with path variance below\n5 mm. The HSA arm's ability to perform active grasping with HSA grippers was\nalso demonstrated, requiring 20 N of pull force to dislodge the object.\nFinally, we test the arm in a pipe inspection task. The arm is able to locate\nall the defects while sliding against the inner surface of the pipe,\ndemonstrating its compliance.\n","authors":["Ian Good","Srivatsan Balaji","David Oh","Sawyer Thomas","Jeffrey I. Lipton"],"pdf_url":"https://arxiv.org/pdf/2501.09819v1.pdf","comment":"9 pages, 8 figures, currently under review"},{"id":"http://arxiv.org/abs/2501.09783v1","updated":"2025-01-16T18:59:51Z","published":"2025-01-16T18:59:51Z","title":"GeoManip: Geometric Constraints as General Interfaces for Robot\n Manipulation","summary":" We present GeoManip, a framework to enable generalist robots to leverage\nessential conditions derived from object and part relationships, as geometric\nconstraints, for robot manipulation. For example, cutting the carrot requires\nadhering to a geometric constraint: the blade of the knife should be\nperpendicular to the carrot's direction. By interpreting these constraints\nthrough symbolic language representations and translating them into low-level\nactions, GeoManip bridges the gap between natural language and robotic\nexecution, enabling greater generalizability across diverse and even unseen tasks,\nobjects, and scenarios. 
Unlike vision-language-action models that require\nextensive training, GeoManip operates training-free by utilizing large foundational\nmodels: a constraint generation module that predicts stage-specific geometric\nconstraints and a geometry parser that identifies object parts involved in\nthese constraints. A solver then optimizes trajectories to satisfy inferred\nconstraints from task descriptions and the scene. Furthermore, GeoManip learns\nin-context and provides five appealing human-robot interaction features:\non-the-fly policy adaptation, learning from human demonstrations, learning from\nfailure cases, long-horizon action planning, and efficient data collection for\nimitation learning. Extensive evaluations on both simulations and real-world\nscenarios demonstrate GeoManip's state-of-the-art performance, with superior\nout-of-distribution generalization while avoiding costly model training.\n","authors":["Weiliang Tang","Jia-Hui Pan","Yun-Hui Liu","Masayoshi Tomizuka","Li Erran Li","Chi-Wing Fu","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2501.09783v1.pdf","comment":"32 pages, 13 figures"},{"id":"http://arxiv.org/abs/2501.09782v1","updated":"2025-01-16T18:59:46Z","published":"2025-01-16T18:59:46Z","title":"SMPLest-X: Ultimate Scaling for Expressive Human Pose and Shape\n Estimation","summary":" Expressive human pose and shape estimation (EHPS) unifies body, hands, and\nface motion capture with numerous applications. Despite encouraging progress,\ncurrent state-of-the-art methods focus on training innovative architectural\ndesigns on confined datasets. In this work, we investigate the impact of\nscaling up EHPS towards a family of generalist foundation models. 1) For data\nscaling, we perform a systematic investigation on 40 EHPS datasets,\nencompassing a wide range of scenarios that a model trained on any single\ndataset cannot handle. More importantly, capitalizing on insights obtained from\nthe extensive benchmarking process, we optimize our training scheme and select\ndatasets that lead to a significant leap in EHPS capabilities. Ultimately, we\nachieve diminishing returns at 10M training instances from diverse data\nsources. 2) For model scaling, we take advantage of vision transformers (up to\nViT-Huge as the backbone) to study the scaling law of model sizes in EHPS. To\nexclude the influence of algorithmic design, we base our experiments on two\nminimalist architectures: SMPLer-X, which consists of an intermediate step for\nhand and face localization, and SMPLest-X, an even simpler version that reduces\nthe network to its bare essentials and highlights significant advances in the\ncapture of articulated hands. With big data and the large model, the foundation\nmodels exhibit strong performance across diverse test benchmarks and excellent\ntransferability to even unseen environments. Moreover, our finetuning strategy\nturns the generalist into specialist models, allowing them to achieve further\nperformance boosts. Notably, our foundation models consistently deliver\nstate-of-the-art results on seven benchmarks such as AGORA, UBody, EgoBody, and\nour proposed SynHand dataset for comprehensive hand evaluation. 
(Code is\navailable at: https://github.com/wqyin/SMPLest-X).\n","authors":["Wanqi Yin","Zhongang Cai","Ruisi Wang","Ailing Zeng","Chen Wei","Qingping Sun","Haiyi Mei","Yanjun Wang","Hui En Pang","Mingyuan Zhang","Lei Zhang","Chen Change Loy","Atsushi Yamashita","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2501.09782v1.pdf","comment":"An extension of SMPLer-X [arXiv:2309.17448]. Homepage:\n https://caizhongang.com/projects/SMPLer-X/"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.09757v1","updated":"2025-01-16T18:59:53Z","published":"2025-01-16T18:59:53Z","title":"Distilling Multi-modal Large Language Models for Autonomous Driving","summary":" Autonomous driving demands safe motion planning, especially in critical\n\"long-tail\" scenarios. Recent end-to-end autonomous driving systems leverage\nlarge language models (LLMs) as planners to improve generalizability to rare\nevents. However, using LLMs at test time introduces high computational costs.\nTo address this, we propose DiMA, an end-to-end autonomous driving system that\nmaintains the efficiency of an LLM-free (or vision-based) planner while\nleveraging the world knowledge of an LLM. DiMA distills the information from a\nmulti-modal LLM to a vision-based end-to-end planner through a set of specially\ndesigned surrogate tasks. Under a joint training strategy, a scene encoder\ncommon to both networks produces structured representations that are\nsemantically grounded as well as aligned to the final planning objective.\nNotably, the LLM is optional at inference, enabling robust planning without\ncompromising on efficiency. Training with DiMA results in a 37% reduction in\nthe L2 trajectory error and an 80% reduction in the collision rate of the\nvision-based planner, as well as a 44% trajectory error reduction in longtail\nscenarios. DiMA also achieves state-of-the-art performance on the nuScenes\nplanning benchmark.\n","authors":["Deepti Hegde","Rajeev Yasarla","Hong Cai","Shizhong Han","Apratim Bhattacharyya","Shweta Mahajan","Litian Liu","Risheek Garrepalli","Vishal M. Patel","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2501.09757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09756v1","updated":"2025-01-16T18:59:48Z","published":"2025-01-16T18:59:48Z","title":"SynthLight: Portrait Relighting with Diffusion Model by Learning to\n Re-render Synthetic Faces","summary":" We introduce SynthLight, a diffusion model for portrait relighting. Our\napproach frames image relighting as a re-rendering problem, where pixels are\ntransformed in response to changes in environmental lighting conditions. Using\na physically-based rendering engine, we synthesize a dataset to simulate this\nlighting-conditioned transformation with 3D head assets under varying lighting.\nWe propose two training and inference strategies to bridge the gap between the\nsynthetic and real image domains: (1) multi-task training that takes advantage\nof real human portraits without lighting labels; (2) an inference time\ndiffusion sampling procedure based on classifier-free guidance that leverages\nthe input portrait to better preserve details. Our method generalizes to\ndiverse real photographs and produces realistic illumination effects, including\nspecular highlights and cast shadows, while preserving the subject's identity.\nOur quantitative experiments on Light Stage data demonstrate results comparable\nto state-of-the-art relighting methods. 
Our qualitative results on in-the-wild\nimages showcase rich and unprecedented illumination effects. Project Page:\n\\url{https://vrroom.github.io/synthlight/}\n","authors":["Sumit Chaturvedi","Mengwei Ren","Yannick Hold-Geoffroy","Jingyuan Liu","Julie Dorsey","Zhixin Shu"],"pdf_url":"https://arxiv.org/pdf/2501.09756v1.pdf","comment":"27 pages, 25 figures, Project Page\n https://vrroom.github.io/synthlight/"},{"id":"http://arxiv.org/abs/2501.09755v1","updated":"2025-01-16T18:59:04Z","published":"2025-01-16T18:59:04Z","title":"Learnings from Scaling Visual Tokenizers for Reconstruction and\n Generation","summary":" Visual tokenization via auto-encoding empowers state-of-the-art image and\nvideo generative models by compressing pixels into a latent space. Although\nscaling Transformer-based generators has been central to recent advances, the\ntokenizer component itself is rarely scaled, leaving open questions about how\nauto-encoder design choices influence both its objective of reconstruction and\ndownstream generative performance. Our work aims to conduct an exploration of\nscaling in auto-encoders to fill in this blank. To facilitate this exploration,\nwe replace the typical convolutional backbone with an enhanced Vision\nTransformer architecture for Tokenization (ViTok). We train ViTok on\nlarge-scale image and video datasets far exceeding ImageNet-1K, removing data\nconstraints on tokenizer scaling. We first study how scaling the auto-encoder\nbottleneck affects both reconstruction and generation -- and find that while it\nis highly correlated with reconstruction, its relationship with generation is\nmore complex. We next explored the effect of separately scaling the\nauto-encoders' encoder and decoder on reconstruction and generation\nperformance. Crucially, we find that scaling the encoder yields minimal gains\nfor either reconstruction or generation, while scaling the decoder boosts\nreconstruction but the benefits for generation are mixed. Building on our\nexploration, we design ViTok as a lightweight auto-encoder that achieves\ncompetitive performance with state-of-the-art auto-encoders on ImageNet-1K and\nCOCO reconstruction tasks (256p and 512p) while outperforming existing\nauto-encoders on 16-frame 128p video reconstruction for UCF-101, all with 2-5x\nfewer FLOPs. When integrated with Diffusion Transformers, ViTok demonstrates\ncompetitive performance on image generation for ImageNet-1K and sets new\nstate-of-the-art benchmarks for class-conditional video generation on UCF-101.\n","authors":["Philippe Hansen-Estruch","David Yan","Ching-Yao Chung","Orr Zohar","Jialiang Wang","Tingbo Hou","Tao Xu","Sriram Vishwanath","Peter Vajda","Xinlei Chen"],"pdf_url":"https://arxiv.org/pdf/2501.09755v1.pdf","comment":"28 pages, 25 figures, 7 Tables"},{"id":"http://arxiv.org/abs/2501.09754v1","updated":"2025-01-16T18:59:03Z","published":"2025-01-16T18:59:03Z","title":"Lost in Translation, Found in Context: Sign Language Translation with\n Contextual Cues","summary":" Our objective is to translate continuous sign language into spoken language\ntext. Inspired by the way human interpreters rely on context for accurate\ntranslation, we incorporate additional contextual cues together with the\nsigning video, into a new translation framework. 
Specifically, besides visual\nsign recognition features that encode the input video, we integrate\ncomplementary textual information from (i) captions describing the background\nshow, (ii) translation of previous sentences, as well as (iii) pseudo-glosses\ntranscribing the signing. These are automatically extracted and inputted along\nwith the visual features to a pre-trained large language model (LLM), which we\nfine-tune to generate spoken language translations in text form. Through\nextensive ablation studies, we show the positive contribution of each input cue\nto the translation performance. We train and evaluate our approach on BOBSL --\nthe largest British Sign Language dataset currently available. We show that our\ncontextual approach significantly enhances the quality of the translations\ncompared to previously reported results on BOBSL, and also to state-of-the-art\nmethods that we implement as baselines. Furthermore, we demonstrate the\ngenerality of our approach by applying it also to How2Sign, an American Sign\nLanguage dataset, and achieve competitive results.\n","authors":["Youngjoon Jang","Haran Raajesh","Liliane Momeni","Gül Varol","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2501.09754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09753v1","updated":"2025-01-16T18:59:02Z","published":"2025-01-16T18:59:02Z","title":"SRE-Conv: Symmetric Rotation Equivariant Convolution for Biomedical\n Image Classification","summary":" Convolutional neural networks (CNNs) are essential tools for computer vision\ntasks, but they lack traditionally desired properties of extracted features\nthat could further improve model performance, e.g., rotational equivariance.\nSuch properties are ubiquitous in biomedical images, which often lack explicit\norientation. While current work largely relies on data augmentation or explicit\nmodules to capture orientation information, this comes at the expense of\nincreased training costs or ineffective approximations of the desired\nequivariance. To overcome these challenges, we propose a novel and efficient\nimplementation of the Symmetric Rotation-Equivariant (SRE) Convolution\n(SRE-Conv) kernel, designed to learn rotation-invariant features while\nsimultaneously compressing the model size. The SRE-Conv kernel can easily be\nincorporated into any CNN backbone. We validate the ability of a deep SRE-CNN\nto capture equivariance to rotation using the public MedMNISTv2 dataset (16\ntotal tasks). SRE-Conv-CNN demonstrated improved rotated image classification\nperformance accuracy on all 16 test datasets in both 2D and 3D images, all\nwhile increasing efficiency with fewer parameters and reduced memory footprint.\nThe code is available at https://github.com/XYPB/SRE-Conv.\n","authors":["Yuexi Du","Jiazhen Zhang","Tal Zeevi","Nicha C. Dvornek","John A. Onofrey"],"pdf_url":"https://arxiv.org/pdf/2501.09753v1.pdf","comment":"Accepted by IEEE ISBI 2025 4-page paper"},{"id":"http://arxiv.org/abs/2403.12953v2","updated":"2025-01-16T18:58:31Z","published":"2024-03-19T17:55:22Z","title":"FutureDepth: Learning to Predict the Future Improves Video Depth\n Estimation","summary":" In this paper, we propose a novel video depth estimation approach,\nFutureDepth, which enables the model to implicitly leverage multi-frame and\nmotion cues to improve depth estimation by making it learn to predict the\nfuture at training. 
More specifically, we propose a future prediction network,\nF-Net, which takes the features of multiple consecutive frames and is trained\nto predict multi-frame features one time step ahead iteratively. In this way,\nF-Net learns the underlying motion and correspondence information, and we\nincorporate its features into the depth decoding process. Additionally, to\nenrich the learning of multiframe correspondence cues, we further leverage a\nreconstruction network, R-Net, which is trained via adaptively masked\nauto-encoding of multiframe feature volumes. At inference time, both F-Net and\nR-Net are used to produce queries to work with the depth decoder, as well as a\nfinal refinement network. Through extensive experiments on several benchmarks,\ni.e., NYUDv2, KITTI, DDAD, and Sintel, which cover indoor, driving, and\nopen-domain scenarios, we show that FutureDepth significantly improves upon\nbaseline models, outperforms existing video depth estimation methods, and sets\nnew state-of-the-art (SOTA) accuracy. Furthermore, FutureDepth is more\nefficient than existing SOTA video depth estimation models and has similar\nlatencies when comparing to monocular models\n","authors":["Rajeev Yasarla","Manish Kumar Singh","Hong Cai","Yunxiao Shi","Jisoo Jeong","Yinhao Zhu","Shizhong Han","Risheek Garrepalli","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2403.12953v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2307.14336v3","updated":"2025-01-16T18:55:36Z","published":"2023-07-26T17:55:32Z","title":"MAMo: Leveraging Memory and Attention for Monocular Video Depth\n Estimation","summary":" We propose MAMo, a novel memory and attention frame-work for monocular video\ndepth estimation. MAMo can augment and improve any single-image depth\nestimation networks into video depth estimation models, enabling them to take\nadvantage of the temporal information to predict more accurate depth. In MAMo,\nwe augment model with memory which aids the depth prediction as the model\nstreams through the video. Specifically, the memory stores learned visual and\ndisplacement tokens of the previous time instances. This allows the depth\nnetwork to cross-reference relevant features from the past when predicting\ndepth on the current frame. We introduce a novel scheme to continuously update\nthe memory, optimizing it to keep tokens that correspond with both the past and\nthe present visual information. We adopt attention-based approach to process\nmemory features where we first learn the spatio-temporal relation among the\nresultant visual and displacement memory tokens using self-attention module.\nFurther, the output features of self-attention are aggregated with the current\nvisual features through cross-attention. The cross-attended features are\nfinally given to a decoder to predict depth on the current frame. Through\nextensive experiments on several benchmarks, including KITTI, NYU-Depth V2, and\nDDAD, we show that MAMo consistently improves monocular depth estimation\nnetworks and sets new state-of-the-art (SOTA) accuracy. 
Notably, our MAMo video\ndepth estimation provides higher accuracy with lower latency when compared to\nSOTA cost-volume-based video depth models.\n","authors":["Rajeev Yasarla","Hong Cai","Jisoo Jeong","Yunxiao Shi","Risheek Garrepalli","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2307.14336v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2501.09733v1","updated":"2025-01-16T18:35:45Z","published":"2025-01-16T18:35:45Z","title":"ComplexVAD: Detecting Interaction Anomalies in Video","summary":" Existing video anomaly detection datasets are inadequate for representing\ncomplex anomalies that occur due to the interactions between objects. The\nabsence of complex anomalies in previous video anomaly detection datasets\naffects research by shifting the focus onto simple anomalies. To address this\nproblem, we introduce a new large-scale dataset: ComplexVAD. In addition, we\npropose a novel method to detect complex anomalies via modeling the\ninteractions between objects using a scene graph with spatio-temporal\nattributes. With our proposed method and two other state-of-the-art video\nanomaly detection methods, we obtain baseline scores on ComplexVAD and\ndemonstrate that our new method outperforms existing works.\n","authors":["Furkan Mumcu","Michael J. Jones","Yasin Yilmaz","Anoop Cherian"],"pdf_url":"https://arxiv.org/pdf/2501.09733v1.pdf","comment":"16 pages, 11 figures, to appear in WACV Workshop ASTAD 2025"},{"id":"http://arxiv.org/abs/2501.09732v1","updated":"2025-01-16T18:30:37Z","published":"2025-01-16T18:30:37Z","title":"Inference-Time Scaling for Diffusion Models beyond Scaling Denoising\n Steps","summary":" Generative models have made significant impacts across various domains,\nlargely due to their ability to scale during training by increasing data,\ncomputational resources, and model size, a phenomenon characterized by the\nscaling laws. Recent research has begun to explore inference-time scaling\nbehavior in Large Language Models (LLMs), revealing how performance can further\nimprove with additional computation during inference. Unlike LLMs, diffusion\nmodels inherently possess the flexibility to adjust inference-time computation\nvia the number of denoising steps, although the performance gains typically\nflatten after a few dozen. In this work, we explore the inference-time scaling\nbehavior of diffusion models beyond increasing denoising steps and investigate\nhow the generation performance can further improve with increased computation.\nSpecifically, we consider a search problem aimed at identifying better noises\nfor the diffusion sampling process. We structure the design space along two\naxes: the verifiers used to provide feedback, and the algorithms used to find\nbetter noise candidates. 
Through extensive experiments on class-conditioned and\ntext-conditioned image generation benchmarks, our findings reveal that\nincreasing inference-time compute leads to substantial improvements in the\nquality of samples generated by diffusion models, and with the complicated\nnature of images, combinations of the components in the framework can be\nspecifically chosen to conform with different application scenarios.\n","authors":["Nanye Ma","Shangyuan Tong","Haolin Jia","Hexiang Hu","Yu-Chuan Su","Mingda Zhang","Xuan Yang","Yandong Li","Tommi Jaakkola","Xuhui Jia","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2501.09732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09720v1","updated":"2025-01-16T18:09:22Z","published":"2025-01-16T18:09:22Z","title":"A Simple Aerial Detection Baseline of Multimodal Language Models","summary":" The multimodal language models (MLMs) based on generative pre-trained\nTransformer are considered powerful candidates for unifying various domains and\ntasks. MLMs developed for remote sensing (RS) have demonstrated outstanding\nperformance in multiple tasks, such as visual question answering and visual\ngrounding. In addition to visual grounding that detects specific objects\ncorresponding to a given instruction, aerial detection, which detects all objects\nof multiple categories, is also a valuable and challenging task for RS\nfoundation models. However, aerial detection has not been explored by existing\nRS MLMs because the autoregressive prediction mechanism of MLMs differs\nsignificantly from the detection outputs. In this paper, we present a simple\nbaseline for applying MLMs to aerial detection for the first time, named\nLMMRotate. Specifically, we first introduce a normalization method to transform\ndetection outputs into textual outputs to be compatible with the MLM framework.\nThen, we propose an evaluation method, which ensures a fair comparison between\nMLMs and conventional object detection models. We construct the baseline by\nfine-tuning open-source general-purpose MLMs and achieve impressive detection\nperformance comparable to conventional detectors. We hope that this baseline\nwill serve as a reference for future MLM development, enabling more\ncomprehensive capabilities for understanding RS images. Code is available at\nhttps://github.com/Li-Qingyun/mllm-mmrotate.\n","authors":["Qingyun Li","Yushi Chen","Xinya Shu","Dong Chen","Xin He","Yi Yu","Xue Yang"],"pdf_url":"https://arxiv.org/pdf/2501.09720v1.pdf","comment":"4 pages, 1 table, 4 figures"},{"id":"http://arxiv.org/abs/2501.09718v1","updated":"2025-01-16T18:06:09Z","published":"2025-01-16T18:06:09Z","title":"FLOL: Fast Baselines for Real-World Low-Light Enhancement","summary":" Low-Light Image Enhancement (LLIE) is a key task in computational photography\nand imaging. The problem of enhancing images captured during night or in dark\nenvironments has been well-studied in the image signal processing literature.\nHowever, current deep learning-based solutions struggle with efficiency and\nrobustness in real-world scenarios (e.g. scenes with noise, saturated pixels,\nbad illumination). We propose a lightweight neural network that combines image\nprocessing in the frequency and spatial domains. Our method, FLOL+, is one of\nthe fastest models for this task, achieving state-of-the-art results on popular\nreal scenes datasets such as LOL and LSRW. Moreover, we are able to process\n1080p images under 12ms. Code and models at https://github.com/cidautai/FLOL\n","authors":["Juan C. 
Benito","Daniel Feijoo","Alvaro Garcia","Marcos V. Conde"],"pdf_url":"https://arxiv.org/pdf/2501.09718v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2501.09705v1","updated":"2025-01-16T17:57:53Z","published":"2025-01-16T17:57:53Z","title":"Practical Continual Forgetting for Pre-trained Vision Models","summary":" For privacy and security concerns, the need to erase unwanted information\nfrom pre-trained vision models is becoming evident nowadays. In real-world\nscenarios, erasure requests originate at any time from both users and model\nowners, and these requests usually form a sequence. Therefore, under such a\nsetting, selective information is expected to be continuously removed from a\npre-trained model while maintaining the rest. We define this problem as\ncontinual forgetting and identify three key challenges. (i) For unwanted\nknowledge, efficient and effective deleting is crucial. (ii) For remaining\nknowledge, the impact brought by the forgetting procedure should be minimal.\n(iii) In real-world scenarios, the training samples may be scarce or partially\nmissing during the process of forgetting. To address them, we first propose\nGroup Sparse LoRA (GS-LoRA). Specifically, towards (i), we introduce LoRA\nmodules to fine-tune the FFN layers in Transformer blocks for each forgetting\ntask independently, and towards (ii), a simple group sparse regularization is\nadopted, enabling automatic selection of specific LoRA groups and zeroing out\nthe others. To further extend GS-LoRA to more practical scenarios, we\nincorporate prototype information as additional supervision and introduce a\nmore practical approach, GS-LoRA++. For each forgotten class, we move the\nlogits away from its original prototype. For the remaining classes, we pull the\nlogits closer to their respective prototypes. We conduct extensive experiments\non face recognition, object detection and image classification and demonstrate\nthat our method manages to forget specific classes with minimal impact on other\nclasses. Codes have been released on https://github.com/bjzhb666/GS-LoRA.\n","authors":["Hongbo Zhao","Fei Zhu","Bolin Ni","Feng Zhu","Gaofeng Meng","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.09705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09695v1","updated":"2025-01-16T17:48:03Z","published":"2025-01-16T17:48:03Z","title":"Mitigating Hallucinations in Large Vision-Language Models via DPO:\n On-Policy Data Hold the Key","summary":" Hallucination remains a major challenge for Large Vision-Language Models\n(LVLMs). Direct Preference Optimization (DPO) has gained increasing attention\nas a simple solution to hallucination issues. It directly learns from\nconstructed preference pairs that reflect the severity of hallucinations in\nresponses to the same prompt and image. Nonetheless, different data\nconstruction methods in existing works bring notable performance variations. We\nidentify a crucial factor here: outcomes are largely contingent on whether the\nconstructed data aligns on-policy w.r.t the initial (reference) policy of DPO.\nTheoretical analysis suggests that learning from off-policy data is impeded by\nthe presence of KL-divergence between the updated policy and the reference\npolicy. From the perspective of dataset distribution, we systematically\nsummarize the inherent flaws in existing algorithms that employ DPO to address\nhallucination issues. 
To alleviate the problems, we propose On-Policy Alignment\n(OPA)-DPO framework, which uniquely leverages expert feedback to correct\nhallucinated responses and aligns both the original and expert-revised\nresponses in an on-policy manner. Notably, with only 4.8k data, OPA-DPO\nachieves an additional reduction in the hallucination rate of LLaVA-1.5-7B:\n13.26% on the AMBER benchmark and 5.39% on the Object-Hal benchmark, compared\nto the previous SOTA algorithm trained with 16k samples.\n","authors":["Zhihe Yang","Xufang Luo","Dongqi Han","Yunjian Xu","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2501.09695v1.pdf","comment":"18 pages, 15 figures"},{"id":"http://arxiv.org/abs/2501.09688v1","updated":"2025-01-16T17:40:19Z","published":"2025-01-16T17:40:19Z","title":"Fine-Grained Image-Text Correspondence with Cost Aggregation for\n Open-Vocabulary Part Segmentation","summary":" Open-Vocabulary Part Segmentation (OVPS) is an emerging field for recognizing\nfine-grained parts in unseen categories. We identify two primary challenges in\nOVPS: (1) the difficulty in aligning part-level image-text correspondence, and\n(2) the lack of structural understanding in segmenting object parts. To address\nthese issues, we propose PartCATSeg, a novel framework that integrates\nobject-aware part-level cost aggregation, compositional loss, and structural\nguidance from DINO. Our approach employs a disentangled cost aggregation\nstrategy that handles object and part-level costs separately, enhancing the\nprecision of part-level segmentation. We also introduce a compositional loss to\nbetter capture part-object relationships, compensating for the limited part\nannotations. Additionally, structural guidance from DINO features improves\nboundary delineation and inter-part understanding. Extensive experiments on\nPascal-Part-116, ADE20K-Part-234, and PartImageNet datasets demonstrate that\nour method significantly outperforms state-of-the-art approaches, setting a new\nbaseline for robust generalization to unseen part categories.\n","authors":["Jiho Choi","Seonho Lee","Minhyun Lee","Seungho Lee","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2501.09688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01184v2","updated":"2025-01-16T17:11:06Z","published":"2025-01-02T10:21:34Z","title":"Vulnerability-Aware Spatio-Temporal Learning for Generalizable and\n Interpretable Deepfake Video Detection","summary":" Detecting deepfake videos is highly challenging due to the complex\nintertwined spatial and temporal artifacts in forged sequences. Most recent\napproaches rely on binary classifiers trained on both real and fake data.\nHowever, such methods may struggle to focus on important artifacts, which can\nhinder their generalization capability. Additionally, these models often lack\ninterpretability, making it difficult to understand how predictions are made.\nTo address these issues, we propose FakeSTormer, offering two key\ncontributions. First, we introduce a multi-task learning framework with\nadditional spatial and temporal branches that enable the model to focus on\nsubtle spatio-temporal artifacts. These branches also provide interpretability\nby highlighting video regions that may contain artifacts. Second, we propose a\nvideo-level data synthesis algorithm that generates pseudo-fake videos with\nsubtle artifacts, providing the model with high-quality samples and ground\ntruth data for our spatial and temporal branches. 
Extensive experiments on\nseveral challenging benchmarks demonstrate the competitiveness of our approach\ncompared to recent state-of-the-art methods. The code is available at\nhttps://github.com/10Ring/FakeSTormer.\n","authors":["Dat Nguyen","Marcella Astrid","Anis Kacem","Enjie Ghorbel","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2501.01184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.05728v2","updated":"2025-01-16T17:09:57Z","published":"2025-01-10T05:53:32Z","title":"Super-class guided Transformer for Zero-Shot Attribute Classification","summary":" Attribute classification is crucial for identifying specific characteristics\nwithin image regions. Vision-Language Models (VLMs) have been effective in\nzero-shot tasks by leveraging their general knowledge from large-scale\ndatasets. Recent studies demonstrate that transformer-based models with\nclass-wise queries can effectively address zero-shot multi-label\nclassification. However, poor utilization of the relationship between seen and\nunseen attributes makes the model lack generalizability. Additionally,\nattribute classification generally involves many attributes, making maintaining\nthe model's scalability difficult. To address these issues, we propose\nSuper-class guided transFormer (SugaFormer), a novel framework that leverages\nsuper-classes to enhance scalability and generalizability for zero-shot\nattribute classification. SugaFormer employs Super-class Query Initialization\n(SQI) to reduce the number of queries, utilizing common semantic information\nfrom super-classes, and incorporates Multi-context Decoding (MD) to handle\ndiverse visual cues. To strengthen generalizability, we introduce two knowledge\ntransfer strategies that utilize VLMs. During training, Super-class guided\nConsistency Regularization (SCR) aligns model's features with VLMs using\nsuper-class guided prompts, and during inference, Zero-shot Retrieval-based\nScore Enhancement (ZRSE) refines predictions for unseen attributes. Extensive\nexperiments demonstrate that SugaFormer achieves state-of-the-art performance\nacross three widely-used attribute classification benchmarks under zero-shot,\nand cross-dataset transfer settings. Our code is available at\nhttps://github.com/mlvlab/SugaFormer.\n","authors":["Sehyung Kim","Chanhyeong Yang","Jihwan Park","Taehoon Song","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2501.05728v2.pdf","comment":"AAAI25"},{"id":"http://arxiv.org/abs/2501.09672v1","updated":"2025-01-16T17:08:12Z","published":"2025-01-16T17:08:12Z","title":"Robin: a Suite of Multi-Scale Vision-Language Models and the CHIRP\n Evaluation Benchmark","summary":" The proliferation of Vision-Language Models (VLMs) in the past several years\ncalls for rigorous and comprehensive evaluation methods and benchmarks. This\nwork analyzes existing VLM evaluation techniques, including automated metrics,\nAI-based assessments, and human evaluations across diverse tasks. We first\nintroduce Robin - a novel suite of VLMs that we built by combining Large\nLanguage Models (LLMs) and Vision Encoders (VEs) at multiple scales, and use\nRobin to identify shortcomings of current evaluation approaches across scales.\nNext, to overcome the identified limitations, we introduce CHIRP - a new long\nform response benchmark we developed for more robust and complete VLM\nevaluation. 
We provide open access to the Robin training code, model suite, and\nCHIRP benchmark to promote reproducibility and advance VLM research.\n","authors":["Alexis Roger","Prateek Humane","Daniel Z. Kaplan","Kshitij Gupta","Qi Sun","George Adamopoulos","Jonathan Siu Chi Lim","Quentin Anthony","Edwin Fennell","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2501.09672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01034v2","updated":"2025-01-16T16:45:29Z","published":"2024-02-01T21:45:12Z","title":"VIS-MAE: An Efficient Self-supervised Learning Approach on Medical Image\n Segmentation and Classification","summary":" Artificial Intelligence (AI) has the potential to revolutionize diagnosis and\nsegmentation in medical imaging. However, development and clinical\nimplementation face multiple challenges including limited data availability,\nlack of generalizability, and the necessity to incorporate multi-modal data\neffectively. A foundation model, which is a large-scale pre-trained AI model,\noffers a versatile base that can be adapted to a variety of specific tasks and\ncontexts. Here, we present VIsualization and Segmentation Masked AutoEncoder\n(VIS-MAE), novel model weights specifically designed for medical imaging.\nSpecifically, VIS-MAE is trained on a dataset of 2.5 million unlabeled images\nfrom various modalities (CT, MR, PET,X-rays, and ultrasound), using\nself-supervised learning techniques. It is then adapted to classification and\nsegmentation tasks using explicit labels. VIS-MAE has high label efficiency,\noutperforming several benchmark models in both in-domain and out-of-domain\napplications. In addition, VIS-MAE has improved label efficiency as it can\nachieve similar performance to other models with a reduced amount of labeled\ntraining data (50% or 80%) compared to other pre-trained weights. VIS-MAE\nrepresents a significant advancement in medical imaging AI, offering a\ngeneralizable and robust solution for improving segmentation and classification\ntasks while reducing the data annotation workload. The source code of this work\nis available at https://github.com/lzl199704/VIS-MAE.\n","authors":["Zelong Liu","Andrew Tieu","Nikhil Patel","Georgios Soultanidis","Louisa Deyer","Ying Wang","Sean Huver","Alexander Zhou","Yunhao Mei","Zahi A. Fayad","Timothy Deyer","Xueyan Mei"],"pdf_url":"https://arxiv.org/pdf/2402.01034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17097v2","updated":"2025-01-16T16:27:33Z","published":"2024-05-27T12:12:26Z","title":"A Comparative Study on Multi-task Uncertainty Quantification in Semantic\n Segmentation and Monocular Depth Estimation","summary":" Deep neural networks excel in perception tasks such as semantic segmentation\nand monocular depth estimation, making them indispensable in safety-critical\napplications like autonomous driving and industrial inspection. However, they\noften suffer from overconfidence and poor explainability, especially for\nout-of-domain data. While uncertainty quantification has emerged as a promising\nsolution to these challenges, multi-task settings have yet to be explored. In\nan effort to shed light on this, we evaluate Monte Carlo Dropout, Deep\nSub-Ensembles, and Deep Ensembles for joint semantic segmentation and monocular\ndepth estimation. Thereby, we reveal that Deep Ensembles stand out as the\npreferred choice, particularly in out-of-domain scenarios, and show the\npotential benefit of multi-task learning with regard to the uncertainty quality\nin comparison to solving both tasks separately. 
Additionally, we highlight the\nimpact of employing different uncertainty thresholds to classify pixels as\ncertain or uncertain, with the median uncertainty emerging as a robust default.\n","authors":["Steven Landgraf","Markus Hillemann","Theodor Kapler","Markus Ulrich"],"pdf_url":"https://arxiv.org/pdf/2405.17097v2.pdf","comment":"This manuscript is an extended version of a previously published\n conference paper and is currently in review for a journal"},{"id":"http://arxiv.org/abs/2501.09635v1","updated":"2025-01-16T16:24:21Z","published":"2025-01-16T16:24:21Z","title":"Unified Face Matching and Physical-Digital Spoofing Attack Detection","summary":" Face recognition technology has dramatically transformed the landscape of\nsecurity, surveillance, and authentication systems, offering a user-friendly\nand non-invasive biometric solution. However, despite its significant\nadvantages, face recognition systems face increasing threats from physical and\ndigital spoofing attacks. Current research typically treats face recognition\nand attack detection as distinct classification challenges. This approach\nnecessitates the implementation of separate models for each task, leading to\nconsiderable computational complexity, particularly on devices with limited\nresources. Such inefficiencies can stifle scalability and hinder performance.\nIn response to these challenges, this paper introduces an innovative unified\nmodel designed for face recognition and detection of physical and digital\nattacks. By leveraging the advanced Swin Transformer backbone and incorporating\nHiLo attention in a convolutional neural network framework, we address unified\nface recognition and spoof attack detection more effectively. Moreover, we\nintroduce augmentation techniques that replicate the traits of physical and\ndigital spoofing cues, significantly enhancing our model robustness. Through\ncomprehensive experimental evaluation across various datasets, we showcase the\neffectiveness of our model in unified face recognition and spoof detection.\nAdditionally, we confirm its resilience against unseen physical and digital\nspoofing attacks, underscoring its potential for real-world applications.\n","authors":["Arun Kunwar","Ajita Rattani"],"pdf_url":"https://arxiv.org/pdf/2501.09635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10729v3","updated":"2025-01-16T16:04:07Z","published":"2024-06-15T20:04:06Z","title":"A Comprehensive Survey of Foundation Models in Medicine","summary":" Foundation models (FMs) are large-scale deep learning models trained on\nmassive datasets, often using self-supervised learning techniques. These models\nserve as a versatile base for a wide range of downstream tasks, including those\nin medicine and healthcare. FMs have demonstrated remarkable success across\nmultiple healthcare domains. However, existing surveys in this field do not\ncomprehensively cover all areas where FMs have made significant strides. In\nthis survey, we present a comprehensive review of FMs in medicine, focusing on\ntheir evolution, learning strategies, flagship models, applications, and\nassociated challenges. 
We examine how prominent FMs, such as the BERT and GPT\nfamilies, are transforming various aspects of healthcare, including clinical\nlarge language models, medical image analysis, and omics research.\nAdditionally, we provide a detailed taxonomy of FM-enabled healthcare\napplications, spanning clinical natural language processing, medical computer\nvision, graph learning, and other biology- and omics- related tasks. Despite\nthe transformative potentials of FMs, they also pose unique challenges. This\nsurvey delves into these challenges and highlights open research questions and\nlessons learned to guide researchers and practitioners. Our goal is to provide\nvaluable insights into the capabilities of FMs in health, facilitating\nresponsible deployment and mitigating associated risks.\n","authors":["Wasif Khan","Seowung Leem","Kyle B. See","Joshua K. Wong","Shaoting Zhang","Ruogu Fang"],"pdf_url":"https://arxiv.org/pdf/2406.10729v3.pdf","comment":"Currently under review in IEEE REVIEWS IN BIOMEDICAL ENGINEERING"},{"id":"http://arxiv.org/abs/2501.05555v2","updated":"2025-01-16T16:00:37Z","published":"2025-01-09T20:02:10Z","title":"Improving Zero-Shot Object-Level Change Detection by Incorporating\n Visual Correspondence","summary":" Detecting object-level changes between two images across possibly different\nviews is a core task in many applications that involve visual inspection or\ncamera surveillance. Existing change-detection approaches suffer from three\nmajor limitations: (1) lack of evaluation on image pairs that contain no\nchanges, leading to unreported false positive rates; (2) lack of\ncorrespondences (i.e., localizing the regions before and after a change); and\n(3) poor zero-shot generalization across different domains. To address these\nissues, we introduce a novel method that leverages change correspondences (a)\nduring training to improve change detection accuracy, and (b) at test time, to\nminimize false positives. That is, we harness the supervision labels of where\nan object is added or removed to supervise change detectors, improving their\naccuracy over previous work by a large margin. Our work is also the first to\npredict correspondences between pairs of detected changes using estimated\nhomography and the Hungarian algorithm. Our model demonstrates superior\nperformance over existing methods, achieving state-of-the-art results in change\ndetection and change correspondence accuracy across both in-distribution and\nzero-shot benchmarks.\n","authors":["Hung Huy Nguyen","Pooyan Rahmanzadehgervi","Long Mai","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2501.05555v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09617v1","updated":"2025-01-16T15:44:24Z","published":"2025-01-16T15:44:24Z","title":"WMamba: Wavelet-based Mamba for Face Forgery Detection","summary":" With the rapid advancement of deepfake generation technologies, the demand\nfor robust and accurate face forgery detection algorithms has become\nincreasingly critical. Recent studies have demonstrated that wavelet analysis\ncan uncover subtle forgery artifacts that remain imperceptible in the spatial\ndomain. Wavelets effectively capture important facial contours, which are often\nslender, fine-grained, and global in nature. However, existing wavelet-based\napproaches fail to fully leverage these unique characteristics, resulting in\nsub-optimal feature extraction and limited generalizability. 
To address this\nchallenge, we introduce WMamba, a novel wavelet-based feature extractor built\nupon the Mamba architecture. WMamba maximizes the utility of wavelet\ninformation through two key innovations. First, we propose Dynamic Contour\nConvolution (DCConv), which employs specially crafted deformable kernels to\nadaptively model slender facial contours. Second, by leveraging the Mamba\narchitecture, our method captures long-range spatial relationships with linear\ncomputational complexity. This efficiency allows for the extraction of\nfine-grained, global forgery artifacts from small image patches. Extensive\nexperimental results show that WMamba achieves state-of-the-art (SOTA)\nperformance, highlighting its effectiveness and superiority in face forgery\ndetection.\n","authors":["Siran Peng","Tianshuo Zhang","Li Gao","Xiangyu Zhu","Haoyuan Zhang","Kai Pang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2501.09617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09608v1","updated":"2025-01-16T15:32:41Z","published":"2025-01-16T15:32:41Z","title":"Metric Learning with Progressive Self-Distillation for Audio-Visual\n Embedding Learning","summary":" Metric learning projects samples into an embedded space, where similarities\nand dissimilarities are quantified based on their learned representations.\nHowever, existing methods often rely on label-guided representation learning,\nwhere representations of different modalities, such as audio and visual data,\nare aligned based on annotated labels. This approach tends to underutilize\nlatent complex features and potential relationships inherent in the\ndistributions of audio and visual data that are not directly tied to the\nlabels, resulting in suboptimal performance in audio-visual embedding learning.\nTo address this issue, we propose a novel architecture that integrates\ncross-modal triplet loss with progressive self-distillation. Our method\nenhances representation learning by leveraging inherent distributions and\ndynamically refining soft audio-visual alignments -- probabilistic alignments\nbetween audio and visual data that capture the inherent relationships beyond\nexplicit labels. Specifically, the model distills audio-visual\ndistribution-based knowledge from annotated labels in a subset of each batch.\nThis self-distilled knowledge is used t\n","authors":["Donghuo Zeng","Kazushi Ikeda"],"pdf_url":"https://arxiv.org/pdf/2501.09608v1.pdf","comment":"5 pages, 3 figures, 2 tables. Accepted by ICASSP 2025"},{"id":"http://arxiv.org/abs/2501.07227v2","updated":"2025-01-16T15:30:54Z","published":"2025-01-13T11:28:49Z","title":"MECD+: Unlocking Event-Level Causal Graph Discovery for Video Reasoning","summary":" Video causal reasoning aims to achieve a high-level understanding of videos\nfrom a causal perspective. However, it exhibits limitations in its scope,\nprimarily executed in a question-answering paradigm and focusing on brief video\nsegments containing isolated events and basic causal relations, lacking\ncomprehensive and structured causality analysis for videos with multiple\ninterconnected events. To fill this gap, we introduce a new task and dataset,\nMulti-Event Causal Discovery (MECD). It aims to uncover the causal relations\nbetween events distributed chronologically across long videos. 
Given visual\nsegments and textual descriptions of events, MECD identifies the causal\nassociations between these events to derive a comprehensive and structured\nevent-level video causal graph explaining why and how the result event\noccurred. To address the challenges of MECD, we devise a novel framework\ninspired by the Granger Causality method, incorporating an efficient mask-based\nevent prediction model to perform an Event Granger Test. It estimates causality\nby comparing the predicted result event when premise events are masked versus\nunmasked. Furthermore, we integrate causal inference techniques such as\nfront-door adjustment and counterfactual inference to mitigate challenges in\nMECD like causality confounding and illusory causality. Additionally, context\nchain reasoning is introduced to conduct more robust and generalized reasoning.\nExperiments validate the effectiveness of our framework in reasoning complete\ncausal relations, outperforming GPT-4o and VideoChat2 by 5.77% and 2.70%,\nrespectively. Further experiments demonstrate that causal relation graphs can\nalso contribute to downstream video understanding tasks such as video question\nanswering and video event prediction.\n","authors":["Tieyuan Chen","Huabin Liu","Yi Wang","Yihang Chen","Tianyao He","Chaofan Gan","Huanyu He","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2501.07227v2.pdf","comment":"IEEE TPAMI Submission. continuous work of arXiv:2409.17647 (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2501.09600v1","updated":"2025-01-16T15:22:06Z","published":"2025-01-16T15:22:06Z","title":"Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid\n Prototyping in Virtual Reality Applications","summary":" SLAM is a foundational technique with broad applications in robotics and\nAR/VR. SLAM simulations evaluate new concepts, but testing on\nresource-constrained devices, such as VR HMDs, faces challenges: high\ncomputational cost and restricted sensor data access. This work proposes a\nsparse framework using mesh geometry projections as features, which improves\nefficiency and circumvents direct sensor data access, advancing SLAM research\nas we demonstrate in VR and through numerical evaluation.\n","authors":["Carlos Augusto Pinheiro de Sousa","Heiko Hamann","Oliver Deussen"],"pdf_url":"https://arxiv.org/pdf/2501.09600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.01957v2","updated":"2025-01-16T15:00:16Z","published":"2025-01-03T18:59:52Z","title":"VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction","summary":" Recent Multimodal Large Language Models (MLLMs) have typically focused on\nintegrating visual and textual modalities, with less emphasis placed on the\nrole of speech in enhancing interaction. However, speech plays a crucial role\nin multimodal dialogue systems, and implementing high-performance in both\nvision and speech tasks remains a significant challenge due to the fundamental\nmodality differences. In this paper, we propose a carefully designed\nmulti-stage training methodology that progressively trains LLM to understand\nboth visual and speech information, ultimately enabling fluent vision and\nspeech interaction. Our approach not only preserves strong vision-language\ncapacity, but also enables efficient speech-to-speech dialogue capabilities\nwithout separate ASR and TTS modules, significantly accelerating multimodal\nend-to-end response speed. 
By comparing our method against state-of-the-art\ncounterparts across benchmarks for image, video, and speech tasks, we\ndemonstrate that our model is equipped with both strong visual and speech\ncapabilities, making near real-time vision and speech interaction.\n","authors":["Chaoyou Fu","Haojia Lin","Xiong Wang","Yi-Fan Zhang","Yunhang Shen","Xiaoyu Liu","Yangze Li","Zuwei Long","Heting Gao","Ke Li","Long Ma","Xiawu Zheng","Rongrong Ji","Xing Sun","Caifeng Shan","Ran He"],"pdf_url":"https://arxiv.org/pdf/2501.01957v2.pdf","comment":"https://github.com/VITA-MLLM/VITA"},{"id":"http://arxiv.org/abs/2501.09579v1","updated":"2025-01-16T14:56:41Z","published":"2025-01-16T14:56:41Z","title":"Sequential PatchCore: Anomaly Detection for Surface Inspection using\n Synthetic Impurities","summary":" The appearance of surface impurities (e.g., water stains, fingerprints,\nstickers) is an often-mentioned issue that causes degradation of automated\nvisual inspection systems. At the same time, synthetic data generation\ntechniques for visual surface inspection have focused primarily on generating\nperfect examples and defects, disregarding impurities. This study highlights\nthe importance of considering impurities when generating synthetic data. We\nintroduce a procedural method to include photorealistic water stains in\nsynthetic data. The synthetic datasets are generated to correspond to real\ndatasets and are further used to train an anomaly detection model and\ninvestigate the influence of water stains. The high-resolution images used for\nsurface inspection lead to memory bottlenecks during anomaly detection\ntraining. To address this, we introduce Sequential PatchCore - a method to\nbuild coresets sequentially and make training on large images using\nconsumer-grade hardware tractable. This allows us to perform transfer learning\nusing coresets pre-trained on different dataset versions. Our results show the\nbenefits of using synthetic data for pre-training an explicit coreset anomaly\nmodel and the extended performance benefits of finetuning the coreset using\nreal data. We observed how the impurities and labelling ambiguity lower the\nmodel performance and have additionally reported the defect-wise recall to\nprovide an industrially relevant perspective on model performance.\n","authors":["Runzhou Mao","Juraj Fulir","Christoph Garth","Petra Gospodnetić"],"pdf_url":"https://arxiv.org/pdf/2501.09579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20891v4","updated":"2025-01-16T14:45:36Z","published":"2024-07-30T15:07:13Z","title":"Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian\n Neural Networks","summary":" Computational complexity of Bayesian learning is impeding its adoption in\npractical, large-scale tasks. Despite demonstrations of significant merits such\nas improved robustness and resilience to unseen or out-of-distribution inputs\nover their non- Bayesian counterparts, their practical use has faded to near\ninsignificance. In this study, we introduce an innovative framework to mitigate\nthe computational burden of Bayesian neural networks (BNNs). Our approach\nfollows the principle of Bayesian techniques based on deep ensembles, but\nsignificantly reduces their cost via multiple low-rank perturbations of\nparameters arising from a pre-trained neural network. 
Both vanilla version of\nensembles as well as more sophisticated schemes such as Bayesian learning with\nStein Variational Gradient Descent (SVGD), previously deemed impractical for\nlarge models, can be seamlessly implemented within the proposed framework,\ncalled Bayesian Low-Rank LeArning (Bella). In a nutshell, i) Bella achieves a\ndramatic reduction in the number of trainable parameters required to\napproximate a Bayesian posterior; and ii) it not only maintains, but in some\ninstances, surpasses the performance of conventional Bayesian learning methods\nand non-Bayesian baselines. Our results with large-scale tasks such as\nImageNet, CAMELYON17, DomainNet, VQA with CLIP, LLaVA demonstrate the\neffectiveness and versatility of Bella in building highly scalable and\npractical Bayesian deep models for real-world applications.\n","authors":["Bao Gia Doan","Afshar Shamsi","Xiao-Yu Guo","Arash Mohammadi","Hamid Alinejad-Rokny","Dino Sejdinovic","Damien Teney","Damith C. Ranasinghe","Ehsan Abbasnejad"],"pdf_url":"https://arxiv.org/pdf/2407.20891v4.pdf","comment":"This paper is accepted in AAAI'2025"},{"id":"http://arxiv.org/abs/2412.04755v2","updated":"2025-01-16T14:44:39Z","published":"2024-12-06T03:40:21Z","title":"Latent Space Characterization of Autoencoder Variants","summary":" Understanding the latent spaces learned by deep learning models is crucial in\nexploring how they represent and generate complex data. Autoencoders (AEs) have\nplayed a key role in the area of representation learning, with numerous\nregularization techniques and training principles developed not only to enhance\ntheir ability to learn compact and robust representations, but also to reveal\nhow different architectures influence the structure and smoothness of the\nlower-dimensional non-linear manifold. We strive to characterize the structure\nof the latent spaces learned by different autoencoders including convolutional\nautoencoders (CAEs), denoising autoencoders (DAEs), and variational\nautoencoders (VAEs) and how they change with the perturbations in the input. By\ncharacterizing the matrix manifolds corresponding to the latent spaces, we\nprovide an explanation for the well-known observation that the latent spaces of\nCAE and DAE form non-smooth manifolds, while that of VAE forms a smooth\nmanifold. We also map the points of the matrix manifold to a Hilbert space\nusing distance preserving transforms and provide an alternate view in terms of\nthe subspaces generated in the Hilbert space as a function of the distortion in\nthe input. The results show that the latent manifolds of CAE and DAE are\nstratified with each stratum being a smooth product manifold, while the\nmanifold of VAE is a smooth product manifold of two symmetric positive definite\nmatrices and a symmetric positive semi-definite matrix.\n","authors":["Anika Shrivastava","Renu Rameshan","Samar Agnihotri"],"pdf_url":"https://arxiv.org/pdf/2412.04755v2.pdf","comment":"9 pages, 6 figures, and 1 table"},{"id":"http://arxiv.org/abs/2501.09565v1","updated":"2025-01-16T14:40:02Z","published":"2025-01-16T14:40:02Z","title":"A New Teacher-Reviewer-Student Framework for Semi-supervised 2D Human\n Pose Estimation","summary":" Conventional 2D human pose estimation methods typically require extensive\nlabeled annotations, which are both labor-intensive and expensive. In contrast,\nsemi-supervised 2D human pose estimation can alleviate the above problems by\nleveraging a large amount of unlabeled data along with a small portion of\nlabeled data. 
Existing semi-supervised 2D human pose estimation methods update\nthe network through backpropagation, ignoring crucial historical information\nfrom the previous training process. Therefore, we propose a novel\nsemi-supervised 2D human pose estimation method by utilizing a newly designed\nTeacher-Reviewer-Student framework. Specifically, we first mimic the phenomenon\nthat human beings constantly review previous knowledge for consolidation to\ndesign our framework, in which the teacher predicts results to guide the\nstudent's learning and the reviewer stores important historical parameters to\nprovide additional supervision signals. Secondly, we introduce a Multi-level\nFeature Learning strategy, which utilizes the outputs from different stages of\nthe backbone to estimate the heatmap to guide network training, enriching the\nsupervisory information while effectively capturing keypoint relationships.\nFinally, we design a data augmentation strategy, i.e., Keypoint-Mix, to perturb\npose information by mixing different keypoints, thus enhancing the network's\nability to discern keypoints. Extensive experiments on publicly available\ndatasets, demonstrate our method achieves significant improvements compared to\nthe existing methods.\n","authors":["Wulian Yun","Mengshi Qi","Fei Peng","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.09565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09555v1","updated":"2025-01-16T14:18:06Z","published":"2025-01-16T14:18:06Z","title":"Text-driven Adaptation of Foundation Models for Few-shot Surgical\n Workflow Analysis","summary":" Purpose: Surgical workflow analysis is crucial for improving surgical\nefficiency and safety. However, previous studies rely heavily on large-scale\nannotated datasets, posing challenges in cost, scalability, and reliance on\nexpert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven\nAdaptation), designed to handle various surgical workflow analysis tasks with\nminimal paired image-label data.\n Methods: Our approach has two key components. First, Few-shot selection-based\nmodality alignment selects a small subset of images and aligns their embeddings\nwith text embeddings from the downstream task, bridging the modality gap.\nSecond, Text-driven adaptation leverages only text data to train a decoder,\neliminating the need for paired image-text data. This decoder is then applied\nto aligned image embeddings, enabling image-related tasks without explicit\nimage-text pairs.\n Results: We evaluate our approach to generative tasks (image captioning) and\ndiscriminative tasks (triplet recognition and phase recognition). Results show\nthat Surg-FTDA outperforms baselines and generalizes well across downstream\ntasks.\n Conclusion: We propose a text-driven adaptation approach that mitigates the\nmodality gap and handles multiple downstream tasks in surgical workflow\nanalysis, with minimal reliance on large annotated datasets. 
The code and\ndataset will be released in https://github.com/TingxuanSix/Surg-FTDA.\n","authors":["Tingxuan Chen","Kun Yuan","Vinkle Srivastav","Nassir Navab","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2501.09555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09552v1","updated":"2025-01-16T14:12:33Z","published":"2025-01-16T14:12:33Z","title":"Exploring AI-based System Design for Pixel-level Protected Health\n Information Detection in Medical Images","summary":" De-identification of medical images is a critical step to ensure privacy\nduring data sharing in research and clinical settings. The initial step in this\nprocess involves detecting Protected Health Information (PHI), which can be\nfound in image metadata or imprinted within image pixels. Despite the\nimportance of such systems, there has been limited evaluation of existing\nAI-based solutions, creating barriers to the development of reliable and robust\ntools. In this study, we present an AI-based pipeline for PHI detection,\ncomprising three key components: text detection, text extraction, and analysis\nof PHI content in medical images. By experimenting with exchanging roles of\nvision and language models within the pipeline, we evaluate the performance and\nrecommend the best setup for the PHI detection task.\n","authors":["Tuan Truong","Ivo M. Baltruschat","Mark Klemens","Grit Werner","Matthias Lenga"],"pdf_url":"https://arxiv.org/pdf/2501.09552v1.pdf","comment":"In progress"},{"id":"http://arxiv.org/abs/2404.14388v3","updated":"2025-01-16T14:02:26Z","published":"2024-04-22T17:46:29Z","title":"STROOBnet Optimization via GPU-Accelerated Proximal Recurrence\n Strategies","summary":" Spatiotemporal networks' observational capabilities are crucial for accurate\ndata gathering and informed decisions across multiple sectors. This study\nfocuses on the Spatiotemporal Ranged Observer-Observable Bipartite Network\n(STROOBnet), linking observational nodes (e.g., surveillance cameras) to events\nwithin defined geographical regions, enabling efficient monitoring. Using data\nfrom Real-Time Crime Camera (RTCC) systems and Calls for Service (CFS) in New\nOrleans, where RTCC combats rising crime amidst reduced police presence, we\naddress the network's initial observational imbalances. Aiming for uniform\nobservational efficacy, we propose the Proximal Recurrence approach. It\noutperformed traditional clustering methods like k-means and DBSCAN by offering\nholistic event frequency and spatial consideration, enhancing observational\ncoverage.\n","authors":["Ted Edward Holmberg","Mahdi Abdelguerfi","Elias Ioup"],"pdf_url":"https://arxiv.org/pdf/2404.14388v3.pdf","comment":"10 pages, 17 figures, 2023 IEEE International Conference on Big Data\n (BigData)"},{"id":"http://arxiv.org/abs/2409.07989v2","updated":"2025-01-16T14:01:58Z","published":"2024-09-12T12:34:29Z","title":"Enhancing Few-Shot Image Classification through Learnable Multi-Scale\n Embedding and Attention Mechanisms","summary":" In the context of few-shot classification, the goal is to train a classifier\nusing a limited number of samples while maintaining satisfactory performance.\nHowever, traditional metric-based methods exhibit certain limitations in\nachieving this objective. These methods typically rely on a single distance\nvalue between the query feature and support feature, thereby overlooking the\ncontribution of shallow features. To overcome this challenge, we propose a\nnovel approach in this paper. 
Our approach involves utilizing a multi-output\nembedding network that maps samples into distinct feature spaces. The proposed\nmethod extracts feature vectors at different stages, enabling the model to\ncapture both global and abstract features. By utilizing these diverse feature\nspaces, our model enhances its performance. Moreover, employing a\nself-attention mechanism improves the refinement of features at each stage,\nleading to even more robust representations and improved overall performance.\nFurthermore, assigning learnable weights to each stage significantly improved\nperformance. We conducted comprehensive evaluations on the\nMiniImageNet and FC100 datasets, specifically in the 5-way 1-shot and 5-way\n5-shot scenarios. Additionally, we performed cross-domain tasks across eight\nbenchmark datasets, achieving high accuracy in the testing domains. These\nevaluations demonstrate the efficacy of our proposed method in comparison to\nstate-of-the-art approaches. https://github.com/FatemehAskari/MSENet\n","authors":["Fatemeh Askari","Amirreza Fateh","Mohammad Reza Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2409.07989v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09532v1","updated":"2025-01-16T13:34:33Z","published":"2025-01-16T13:34:33Z","title":"AdaFV: Accelerating VLMs with Self-Adaptive Cross-Modality Attention\n Mixture","summary":" The success of VLMs often relies on the dynamic high-resolution schema that\nadaptively augments the input images to multiple crops, so that the details of\nthe images can be retained. However, such approaches result in a large number\nof redundant visual tokens, thus significantly reducing the efficiency of the\nVLMs. To improve the VLMs' efficiency without introducing extra training costs,\nmany research works have been proposed to reduce the visual tokens by filtering the\nuninformative visual tokens or aggregating their information. Some approaches\npropose to reduce the visual tokens according to the self-attention of VLMs,\nwhich is biased, resulting in inaccurate responses. Token reduction\napproaches that rely solely on visual cues are text-agnostic and fail to focus on\nthe areas that are most relevant to the question, especially when the queried\nobjects are non-salient in the image. In this work, we first conduct\nexperiments to show that the original text embeddings are aligned with the\nvisual tokens, without bias on the tailed visual tokens. We then propose a\nself-adaptive cross-modality attention mixture mechanism that dynamically\nleverages the effectiveness of visual saliency and text-to-image similarity in\nthe pre-LLM layers to select the visual tokens that are informative. Extensive\nexperiments demonstrate that the proposed approach achieves state-of-the-art\ntraining-free VLM acceleration performance, especially when the reduction rate\nis sufficiently large.\n","authors":["Jiayi Han","Liang Du","Yiwen Wu","Xiangguo Zhou","Hongwei Du","Weibo Zheng"],"pdf_url":"https://arxiv.org/pdf/2501.09532v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.24031v3","updated":"2025-01-16T13:20:56Z","published":"2024-10-31T15:29:51Z","title":"A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems\n using Disparity Maps","summary":" Face recognition technologies are increasingly used in various applications,\nyet they are vulnerable to face spoofing attacks. 
These spoofing attacks often\ninvolve unique 3D structures, such as printed papers or mobile device screens.\nAlthough stereo-depth cameras can detect such attacks effectively, their\nhigh-cost limits their widespread adoption. Conversely, two-sensor systems\nwithout extrinsic calibration offer a cost-effective alternative but are unable\nto calculate depth using stereo techniques. In this work, we propose a method\nto overcome this challenge by leveraging facial attributes to derive disparity\ninformation and estimate relative depth for anti-spoofing purposes, using\nnon-calibrated systems. We introduce a multi-modal anti-spoofing model, coined\nDisparity Model, that incorporates created disparity maps as a third modality\nalongside the two original sensor modalities. We demonstrate the effectiveness\nof the Disparity Model in countering various spoof attacks using a\ncomprehensive dataset collected from the Intel RealSense ID Solution F455. Our\nmethod outperformed existing methods in the literature, achieving an Equal\nError Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False\nPositive Rate (FPR) of 1%. These errors are lower by 2.45% and 7.94% than the\nerrors of the best comparison method, respectively. Additionally, we introduce\na model ensemble that addresses 3D spoof attacks as well, achieving an EER of\n2.04% and an FNR of 3.83% at an FPR of 1%. Overall, our work provides a\nstate-of-the-art solution for the challenging task of anti-spoofing in\nnon-calibrated systems that lack depth information.\n","authors":["Ariel Larey","Eyal Rond","Omer Achrack"],"pdf_url":"https://arxiv.org/pdf/2410.24031v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09504v1","updated":"2025-01-16T12:33:48Z","published":"2025-01-16T12:33:48Z","title":"HydraMix: Multi-Image Feature Mixing for Small Data Image Classification","summary":" Training deep neural networks requires datasets with a large number of\nannotated examples. The collection and annotation of these datasets is not only\nextremely expensive but also faces legal and privacy problems. These factors\nare a significant limitation for many real-world applications. To address this,\nwe introduce HydraMix, a novel architecture that generates new image\ncompositions by mixing multiple different images from the same class. HydraMix\nlearns the fusion of the content of various images guided by a\nsegmentation-based mixing mask in feature space and is optimized via a\ncombination of unsupervised and adversarial training. Our data augmentation\nscheme allows the creation of models trained from scratch on very small\ndatasets. We conduct extensive experiments on ciFAIR-10, STL-10, and\nciFAIR-100. Additionally, we introduce a novel text-image metric to assess the\ngenerality of the augmented datasets. Our results show that HydraMix\noutperforms existing state-of-the-art methods for image classification on small\ndatasets.\n","authors":["Christoph Reinders","Frederik Schubert","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2501.09504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09503v1","updated":"2025-01-16T12:28:39Z","published":"2025-01-16T12:28:39Z","title":"AnyStory: Towards Unified Single and Multiple Subject Personalization in\n Text-to-Image Generation","summary":" Recently, large-scale generative models have demonstrated outstanding\ntext-to-image generation capabilities. 
However, generating high-fidelity\npersonalized images with specific subjects still presents challenges,\nespecially in cases involving multiple subjects. In this paper, we propose\nAnyStory, a unified approach for personalized subject generation. AnyStory not\nonly achieves high-fidelity personalization for single subjects, but also for\nmultiple subjects, without sacrificing subject fidelity. Specifically, AnyStory\nmodels the subject personalization problem in an \"encode-then-route\" manner. In\nthe encoding step, AnyStory utilizes a universal and powerful image encoder,\ni.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve\nhigh-fidelity encoding of subject features. In the routing step, AnyStory\nutilizes a decoupled instance-aware subject router to accurately perceive and\npredict the potential location of the corresponding subject in the latent\nspace, and guide the injection of subject conditions. Detailed experimental\nresults demonstrate the excellent performance of our method in retaining\nsubject details, aligning text descriptions, and personalizing for multiple\nsubjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .\n","authors":["Junjie He","Yuxiang Tuo","Binghui Chen","Chongyang Zhong","Yifeng Geng","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2501.09503v1.pdf","comment":"Tech report; Project page:\n https://aigcdesigngroup.github.io/AnyStory/"},{"id":"http://arxiv.org/abs/2501.09502v1","updated":"2025-01-16T12:27:05Z","published":"2025-01-16T12:27:05Z","title":"Omni-Emotion: Extending Video MLLM with Detailed Face and Audio Modeling\n for Multimodal Emotion Analysis","summary":" Understanding emotions accurately is essential for fields like human-computer\ninteraction. Due to the complexity of emotions and their multi-modal nature\n(e.g., emotions are influenced by facial expressions and audio), researchers\nhave turned to using multi-modal models to understand human emotions rather\nthan single-modality. However, current video multi-modal large language models\n(MLLMs) encounter difficulties in effectively integrating audio and identifying\nsubtle facial micro-expressions. Furthermore, the lack of detailed emotion\nanalysis datasets also limits the development of multimodal emotion analysis.\nTo address these issues, we introduce a self-reviewed dataset and a\nhuman-reviewed dataset, comprising 24,137 coarse-grained samples and 3,500\nmanually annotated samples with detailed emotion annotations, respectively.\nThese datasets allow models to learn from diverse scenarios and better\ngeneralize to real-world applications. Moreover, in addition to the audio\nmodeling, we propose to explicitly integrate facial encoding models into the\nexisting advanced Video MLLM, enabling the MLLM to effectively unify audio and\nthe subtle facial cues for emotion understanding. 
By aligning these features\nwithin a unified space and employing instruction tuning in our proposed\ndatasets, our Omni-Emotion achieves state-of-the-art performance in both\nemotion recognition and reasoning tasks.\n","authors":["Qize Yang","Detao Bai","Yi-Xing Peng","Xihan Wei"],"pdf_url":"https://arxiv.org/pdf/2501.09502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09499v1","updated":"2025-01-16T12:20:40Z","published":"2025-01-16T12:20:40Z","title":"VanGogh: A Unified Multimodal Diffusion-based Framework for Video\n Colorization","summary":" Video colorization aims to transform grayscale videos into vivid color\nrepresentations while maintaining temporal consistency and structural\nintegrity. Existing video colorization methods often suffer from color bleeding\nand lack comprehensive control, particularly under complex motion or diverse\nsemantic cues. To this end, we introduce VanGogh, a unified multimodal\ndiffusion-based framework for video colorization. VanGogh tackles these\nchallenges using a Dual Qformer to align and fuse features from multiple\nmodalities, complemented by a depth-guided generation process and an optical\nflow loss, which help reduce color overflow. Additionally, a color injection\nstrategy and luma channel replacement are implemented to improve generalization\nand mitigate flickering artifacts. Thanks to this design, users can exercise\nboth global and local control over the generation process, resulting in\nhigher-quality colorized videos. Extensive qualitative and quantitative\nevaluations, and user studies, demonstrate that VanGogh achieves superior\ntemporal consistency and color fidelity.Project page:\nhttps://becauseimbatman0.github.io/VanGogh.\n","authors":["Zixun Fang","Zhiheng Liu","Kai Zhu","Yu Liu","Ka Leong Cheng","Wei Zhai","Yang Cao","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2501.09499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09377v3","updated":"2025-01-16T12:12:24Z","published":"2023-06-15T08:18:29Z","title":"Evaluating alignment between humans and neural network representations\n in image-based learning tasks","summary":" Humans represent scenes and objects in rich feature spaces, carrying\ninformation that allows us to generalise about category memberships and\nabstract functions with few examples. What determines whether a neural network\nmodel generalises like a human? We tested how well the representations of $86$\npretrained neural network models mapped to human learning trajectories across\ntwo tasks where humans had to learn continuous relationships and categories of\nnatural images. In these tasks, both human participants and neural networks\nsuccessfully identified the relevant stimulus features within a few trials,\ndemonstrating effective generalisation. We found that while training dataset\nsize was a core determinant of alignment with human choices, contrastive\ntraining with multi-modal data (text and imagery) was a common feature of\ncurrently publicly available models that predicted human generalisation.\nIntrinsic dimensionality of representations had different effects on alignment\nfor different model types. Lastly, we tested three sets of human-aligned\nrepresentations and found no consistent improvements in predictive accuracy\ncompared to the baselines. In conclusion, pretrained neural networks can serve\nto extract representations for cognitive models, as they appear to capture some\nfundamental aspects of cognition that are transferable across tasks. 
Both our\nparadigms and modelling approach offer a novel way to quantify alignment\nbetween neural networks and humans and extend cognitive science into more\nnaturalistic domains.\n","authors":["Can Demircan","Tankred Saanum","Leonardo Pettini","Marcel Binz","Blazej M Baczkowski","Christian F Doeller","Mona M Garvert","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2306.09377v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08443v2","updated":"2025-01-16T12:06:35Z","published":"2024-12-26T05:41:31Z","title":"Instruction-Guided Fusion of Multi-Layer Visual Features in Large\n Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) have achieved significant success in\nmultimodal tasks by combining pre-trained vision encoders and large language\nmodels. However, current LVLMs mainly rely on features from the final layers of\nthe vision encoder, neglecting complementary information in shallower layers.\nWhile recent methods have explored multi-layer features, they are often\ntask-agnostic. We investigate the contributions of visual features from\ndifferent encoder layers across 18 benchmarks and 6 task categories. Our\nresults show that multi-layer features provide complementary strengths with\nvarying task dependencies, and uniform fusion performs suboptimally. Based on\nthese findings, we propose an instruction-guided vision aggregator that\ndynamically integrates multi-layer features based on textual instructions,\nwithout increasing the number of visual tokens. Extensive evaluations show\nsuperior performance, and analysis reveals the dominance of mid-to-high-level\nfeatures in semantic tasks and the critical role of low-level features in\nfine-grained perception. This work provides valuable insights into the adaptive\nuse of hierarchical visual features in LVLMs, advancing more flexible\nmultimodal systems.\n","authors":["Xu Li","Yi Zheng","Haotian Chen","Xiaolei Chen","Yuxuan Liang","Chenghang Lai","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2501.08443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09490v1","updated":"2025-01-16T12:01:44Z","published":"2025-01-16T12:01:44Z","title":"Comparison of Various SLAM Systems for Mobile Robot in an Indoor\n Environment","summary":" This article presents a comparative analysis of a mobile robot trajectories\ncomputed by various ROS-based SLAM systems. For this reason we developed a\nprototype of a mobile robot with common sensors: 2D lidar, a monocular and ZED\nstereo cameras. Then we conducted experiments in a typical office environment\nand collected data from all sensors, running all tested SLAM systems based on\nthe acquired dataset. We studied the following SLAM systems: (a) 2D\nlidar-based: GMapping, Hector SLAM, Cartographer; (b) monocular camera-based:\nLarge Scale Direct monocular SLAM (LSD SLAM), ORB SLAM, Direct Sparse Odometry\n(DSO); and (c) stereo camera-based: ZEDfu, Real-Time Appearance-Based Mapping\n(RTAB map), ORB SLAM, Stereo Parallel Tracking and Mapping (S-PTAM). 
Since all\nSLAM methods were tested on the same dataset we compared results for different\nSLAM systems with appropriate metrics, demonstrating encouraging results for\nlidar-based Cartographer SLAM, Monocular ORB SLAM and Stereo RTAB Map methods.\n","authors":["Maksim Filipenko","Ilya Afanasyev"],"pdf_url":"https://arxiv.org/pdf/2501.09490v1.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.09485v1","updated":"2025-01-16T11:44:29Z","published":"2025-01-16T11:44:29Z","title":"The Devil is in the Details: Simple Remedies for Image-to-LiDAR\n Representation Learning","summary":" LiDAR is a crucial sensor in autonomous driving, commonly used alongside\ncameras. By exploiting this camera-LiDAR setup and recent advances in image\nrepresentation learning, prior studies have shown the promising potential of\nimage-to-LiDAR distillation. These prior arts focus on the designs of their own\nlosses to effectively distill the pre-trained 2D image representations into a\n3D model. However, the other parts of the designs have been surprisingly\nunexplored. We find that fundamental design elements, e.g., the LiDAR\ncoordinate system, quantization according to the existing input interface, and\ndata utilization, are more critical than developing loss functions, which have\nbeen overlooked in prior works. In this work, we show that simple fixes to\nthese designs notably outperform existing methods by 16% in 3D semantic\nsegmentation on the nuScenes dataset and 13% in 3D object detection on the\nKITTI dataset in downstream task performance. We focus on overlooked design\nchoices along the spatial and temporal axes. Spatially, prior work has used\ncylindrical coordinate and voxel sizes without considering their side effects\nyielded with a commonly deployed sparse convolution layer input interface,\nleading to spatial quantization errors in 3D models. Temporally, existing work\nhas avoided cumbersome data curation by discarding unsynced data, limiting the\nuse to only the small portion of data that is temporally synced across sensors.\nWe analyze these effects and propose simple solutions for each overlooked\naspect.\n","authors":["Wonjun Jo","Kwon Byung-Ki","Kim Ji-Yeon","Hawook Jeong","Kyungdon Joo","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2501.09485v1.pdf","comment":"Accepted to ACCV2024"},{"id":"http://arxiv.org/abs/2501.09481v1","updated":"2025-01-16T11:35:22Z","published":"2025-01-16T11:35:22Z","title":"MonoSOWA: Scalable monocular 3D Object detector Without human\n Annotations","summary":" Detecting the three-dimensional position and orientation of objects using a\nsingle RGB camera is a foundational task in computer vision with many important\napplications. Traditionally, 3D object detection methods are trained in a\nfully-supervised setup, requiring vast amounts of human annotations, which are\nlaborious, costly, and do not scale well with the ever-increasing amounts of\ndata being captured.\n In this paper, we present the first method to train 3D object detectors for\nmonocular RGB cameras without domain-specific human annotations, thus making\norders of magnitude more data available for training. 
Thanks to newly proposed\nCanonical Object Space, the method can not only exploit data across a variety\nof datasets and camera setups to train a single 3D detector, but unlike\nprevious work it also works out of the box in previously unseen camera setups.\nAll this is crucial for practical applications, where the data and cameras are\nextremely heterogeneous.\n The method is evaluated on two standard autonomous driving datasets, where it\noutperforms previous works, which, unlike our method, still rely on 2D human\nannotations.\n","authors":["Jan Skvrna","Lukas Neumann"],"pdf_url":"https://arxiv.org/pdf/2501.09481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.04747v6","updated":"2025-01-16T11:17:04Z","published":"2022-09-10T22:00:30Z","title":"Diffusion Models in Vision: A Survey","summary":" Denoising diffusion models represent a recent emerging topic in computer\nvision, demonstrating remarkable results in the area of generative modeling. A\ndiffusion model is a deep generative model that is based on two stages, a\nforward diffusion stage and a reverse diffusion stage. In the forward diffusion\nstage, the input data is gradually perturbed over several steps by adding\nGaussian noise. In the reverse stage, a model is tasked at recovering the\noriginal input data by learning to gradually reverse the diffusion process,\nstep by step. Diffusion models are widely appreciated for the quality and\ndiversity of the generated samples, despite their known computational burdens,\ni.e. low speeds due to the high number of steps involved during sampling. In\nthis survey, we provide a comprehensive review of articles on denoising\ndiffusion models applied in vision, comprising both theoretical and practical\ncontributions in the field. First, we identify and present three generic\ndiffusion modeling frameworks, which are based on denoising diffusion\nprobabilistic models, noise conditioned score networks, and stochastic\ndifferential equations. We further discuss the relations between diffusion\nmodels and other deep generative models, including variational auto-encoders,\ngenerative adversarial networks, energy-based models, autoregressive models and\nnormalizing flows. Then, we introduce a multi-perspective categorization of\ndiffusion models applied in computer vision. Finally, we illustrate the current\nlimitations of diffusion models and envision some interesting directions for\nfuture research.\n","authors":["Florinel-Alin Croitoru","Vlad Hondru","Radu Tudor Ionescu","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2209.04747v6.pdf","comment":"Accepted in IEEE Transactions on Pattern Analysis and Machine\n Intelligence. 25 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.09466v1","updated":"2025-01-16T10:59:29Z","published":"2025-01-16T10:59:29Z","title":"DEFOM-Stereo: Depth Foundation Model Based Stereo Matching","summary":" Stereo matching is a key technique for metric depth estimation in computer\nvision and robotics. Real-world challenges like occlusion and non-texture\nhinder accurate disparity estimation from binocular matching cues. Recently,\nmonocular relative depth estimation has shown remarkable generalization using\nvision foundation models. Thus, to facilitate robust stereo matching with\nmonocular depth cues, we incorporate a robust monocular relative depth model\ninto the recurrent stereo-matching framework, building a new framework for\ndepth foundation model-based stereo-matching, DEFOM-Stereo. 
In the feature\nextraction stage, we construct the combined context and matching feature\nencoder by integrating features from conventional CNNs and DEFOM. In the update\nstage, we use the depth predicted by DEFOM to initialize the recurrent\ndisparity and introduce a scale update module to refine the disparity at the\ncorrect scale. DEFOM-Stereo is verified to have comparable performance on the\nScene Flow dataset with state-of-the-art (SOTA) methods and notably shows much\nstronger zero-shot generalization. Moreover, DEFOM-Stereo achieves SOTA\nperformance on the KITTI 2012, KITTI 2015, Middlebury, and ETH3D benchmarks,\nranking 1st on many metrics. In the joint evaluation under the robust vision\nchallenge, our model simultaneously outperforms previous models on the\nindividual benchmarks. Both results demonstrate the outstanding capabilities of\nthe proposed model.\n","authors":["Hualie Jiang","Zhiqiang Lou","Laiyan Ding","Rui Xu","Minglang Tan","Wenjie Jiang","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2501.09466v1.pdf","comment":"Code: https://github.com/Insta360-Research-Team/DEFOM-Stereo"},{"id":"http://arxiv.org/abs/2312.14150v3","updated":"2025-01-16T10:57:44Z","published":"2023-12-21T18:59:12Z","title":"DriveLM: Driving with Graph Visual Question Answering","summary":" We study how vision-language models (VLMs) trained on web-scale data can be\nintegrated into end-to-end driving systems to boost generalization and enable\ninteractivity with human users. While recent approaches adapt VLMs to driving\nvia single-round visual question answering (VQA), human drivers reason about\ndecisions in multiple steps. Starting from the localization of key objects,\nhumans estimate object interactions before taking actions. The key insight is\nthat with our proposed task, Graph VQA, where we model graph-structured\nreasoning through perception, prediction and planning question-answer pairs, we\nobtain a suitable proxy task to mimic the human reasoning process. We\ninstantiate datasets (DriveLM-Data) built upon nuScenes and CARLA, and propose\na VLM-based baseline approach (DriveLM-Agent) for jointly performing Graph VQA\nand end-to-end driving. The experiments demonstrate that Graph VQA provides a\nsimple, principled framework for reasoning about a driving scene, and\nDriveLM-Data provides a challenging benchmark for this task. Our DriveLM-Agent\nbaseline performs end-to-end autonomous driving competitively in comparison to\nstate-of-the-art driving-specific architectures. Notably, its benefits are\npronounced when it is evaluated zero-shot on unseen objects or sensor\nconfigurations. We hope this work can be the starting point to shed new light\non how to apply VLMs for autonomous driving. 
To facilitate future research, all\ncode, data, and models are available to the public.\n","authors":["Chonghao Sima","Katrin Renz","Kashyap Chitta","Li Chen","Hanxue Zhang","Chengen Xie","Jens Beißwenger","Ping Luo","Andreas Geiger","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.14150v3.pdf","comment":"Accepted to ECCV 2024 as Oral paper"},{"id":"http://arxiv.org/abs/2501.09465v1","updated":"2025-01-16T10:56:45Z","published":"2025-01-16T10:56:45Z","title":"RE-POSE: Synergizing Reinforcement Learning-Based Partitioning and\n Offloading for Edge Object Detection","summary":" Object detection plays a crucial role in smart video analysis, with\napplications ranging from autonomous driving and security to smart cities.\nHowever, achieving real-time object detection on edge devices presents\nsignificant challenges due to their limited computational resources and the\nhigh demands of deep neural network (DNN)-based detection models, particularly\nwhen processing high-resolution video. Conventional strategies, such as input\ndown-sampling and network up-scaling, often compromise detection accuracy for\nfaster performance or lead to higher inference latency. To address these\nissues, this paper introduces RE-POSE, a Reinforcement Learning (RL)-Driven\nPartitioning and Edge Offloading framework designed to optimize the\naccuracy-latency trade-off in resource-constrained edge environments. Our\napproach features an RL-Based Dynamic Clustering Algorithm (RL-DCA) that\npartitions video frames into non-uniform blocks based on object distribution\nand the computational characteristics of DNNs. Furthermore, a parallel edge\noffloading scheme is implemented to distribute these blocks across multiple\nedge servers for concurrent processing. Experimental evaluations show that\nRE-POSE significantly enhances detection accuracy and reduces inference\nlatency, surpassing existing methods.\n","authors":["Jianrui Shi","Yong Zhao","Zeyang Cui","Xiaoming Shen","Minhang Zeng","Xiaojie Liu"],"pdf_url":"https://arxiv.org/pdf/2501.09465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08258v2","updated":"2025-01-16T10:55:41Z","published":"2025-01-14T17:10:02Z","title":"Towards an End-to-End (E2E) Adversarial Learning and Application in the\n Physical World","summary":" The traditional learning process of patch-based adversarial attacks,\nconducted in the digital domain and then applied in the physical domain (e.g.,\nvia printed stickers), may suffer from reduced performance due to adversarial\npatches' limited transferability from the digital domain to the physical\ndomain. Given that previous studies have considered using projectors to apply\nadversarial attacks, we raise the following question: can adversarial learning\n(i.e., patch generation) be performed entirely in the physical domain with a\nprojector? In this work, we propose the Physical-domain Adversarial Patch\nLearning Augmentation (PAPLA) framework, a novel end-to-end (E2E) framework\nthat converts adversarial learning from the digital domain to the physical\ndomain using a projector. We evaluate PAPLA across multiple scenarios,\nincluding controlled laboratory settings and realistic outdoor environments,\ndemonstrating its ability to ensure attack success compared to conventional\ndigital learning-physical application (DL-PA) methods. 
We also analyze the\nimpact of environmental factors, such as projection surface color, projector\nstrength, ambient light, distance, and angle of the target object relative to\nthe camera, on the effectiveness of projected patches. Finally, we demonstrate\nthe feasibility of the attack against a parked car and a stop sign in a\nreal-world outdoor environment. Our results show that under specific\nconditions, E2E adversarial learning in the physical domain eliminates the\ntransferability issue and ensures evasion by object detectors. Finally, we\nprovide insights into the challenges and opportunities of applying adversarial\nlearning in the physical domain and explain where such an approach is more\neffective than using a sticker.\n","authors":["Dudi Biton","Jacob Shams","Satoru Koda","Asaf Shabtai","Yuval Elovici","Ben Nassi"],"pdf_url":"https://arxiv.org/pdf/2501.08258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09460v1","updated":"2025-01-16T10:42:29Z","published":"2025-01-16T10:42:29Z","title":"Normal-NeRF: Ambiguity-Robust Normal Estimation for Highly Reflective\n Scenes","summary":" Neural Radiance Fields (NeRF) often struggle with reconstructing and\nrendering highly reflective scenes. Recent advancements have developed various\nreflection-aware appearance models to enhance NeRF's capability to render\nspecular reflections. However, the robust reconstruction of highly reflective\nscenes is still hindered by the inherent shape ambiguity on specular surfaces.\nExisting methods typically rely on additional geometry priors to regularize the\nshape prediction, but this can lead to oversmoothed geometry in complex scenes.\nObserving the critical role of surface normals in parameterizing reflections,\nwe introduce a transmittance-gradient-based normal estimation technique that\nremains robust even under ambiguous shape conditions. Furthermore, we propose a\ndual activated densities module that effectively bridges the gap between smooth\nsurface normals and sharp object boundaries. Combined with a reflection-aware\nappearance model, our proposed method achieves robust reconstruction and\nhigh-fidelity rendering of scenes featuring both highly specular reflections\nand intricate geometric structures. Extensive experiments demonstrate that our\nmethod outperforms existing state-of-the-art methods on various datasets.\n","authors":["Ji Shi","Xianghua Ying","Ruohao Guo","Bowei Xing","Wenzhen Yue"],"pdf_url":"https://arxiv.org/pdf/2501.09460v1.pdf","comment":"AAAI 2025, code available at https://github.com/sjj118/Normal-NeRF"},{"id":"http://arxiv.org/abs/2501.09456v1","updated":"2025-01-16T10:31:51Z","published":"2025-01-16T10:31:51Z","title":"On the Relation between Optical Aperture and Automotive Object Detection","summary":" We explore the impact of aperture size and shape on automotive camera systems\nfor deep-learning-based tasks like traffic sign recognition and light state\ndetection. A method is proposed to simulate optical effects using the point\nspread function (PSF), enhancing realism and reducing the domain gap between\nsynthetic and real-world images. 
Computer-generated scenes are refined with\nthis technique to model optical distortions and improve simulation accuracy.\n","authors":["Ofer Bar-Shalom","Tzvi Philipp","Eran Kishon"],"pdf_url":"https://arxiv.org/pdf/2501.09456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09446v1","updated":"2025-01-16T10:20:48Z","published":"2025-01-16T10:20:48Z","title":"Double Visual Defense: Adversarial Pre-training and Instruction Tuning\n for Improving Vision-Language Model Robustness","summary":" This paper investigates the robustness of vision-language models against\nadversarial visual perturbations and introduces a novel ``double visual\ndefense\" to enhance this robustness. Unlike previous approaches that resort to\nlightweight adversarial fine-tuning of a pre-trained CLIP model, we perform\nlarge-scale adversarial vision-language pre-training from scratch using\nweb-scale data. We then strengthen the defense by incorporating adversarial\nvisual instruction tuning. The resulting models from each stage, $\\Delta$CLIP\nand $\\Delta^2$LLaVA, show substantially enhanced zero-shot robustness and set a\nnew state-of-the-art in adversarial defense for vision-language models. For\nexample, the adversarial robustness of $\\Delta$CLIP surpasses that of the\nprevious best models on ImageNet-1k by ~20%. %For example, $\\Delta$CLIP\nsurpasses the previous best models on ImageNet-1k by ~20% in terms of\nadversarial robustness. Similarly, compared to prior art, $\\Delta^2$LLaVA\nbrings a ~30% robustness improvement to image captioning task and a ~20%\nrobustness improvement to visual question answering task. Furthermore, our\nmodels exhibit stronger zero-shot recognition capability, fewer hallucinations,\nand superior reasoning performance compared to baselines. Our project page is\nhttps://doublevisualdefense.github.io/.\n","authors":["Zeyu Wang","Cihang Xie","Brian Bartoldson","Bhavya Kailkhura"],"pdf_url":"https://arxiv.org/pdf/2501.09446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15500v4","updated":"2025-01-16T10:20:32Z","published":"2024-07-22T09:31:30Z","title":"TextureCrop: Enhancing Synthetic Image Detection through Texture-based\n Cropping","summary":" Generative AI technologies produce increasingly realistic imagery, which,\ndespite its potential for creative applications, can also be misused to produce\nmisleading and harmful content. This renders Synthetic Image Detection (SID)\nmethods essential for identifying AI-generated content online. State-of-the-art\nSID methods typically resize or center-crop input images due to architectural\nor computational constraints, which hampers the detection of artifacts that\nappear in high-resolution images. To address this limitation, we propose\nTextureCrop, an image pre-processing component that can be plugged in any\npre-trained SID model to improve its performance. By focusing on high-frequency\nimage parts where generative artifacts are prevalent, TextureCrop enhances SID\nperformance with manageable memory requirements. 
Experimental results\ndemonstrate a consistent improvement in AUC across various detectors by 6.1%\ncompared to center cropping and by 15% compared to resizing, across\nhigh-resolution images from the Forensynths, Synthbuster and TWIGMA datasets.\nCode available at https : //github.com/mever-team/texture-crop.\n","authors":["Despina Konstantinidou","Christos Koutlis","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.15500v4.pdf","comment":"10 pages, 7 images"},{"id":"http://arxiv.org/abs/2501.09436v1","updated":"2025-01-16T10:07:44Z","published":"2025-01-16T10:07:44Z","title":"Scaling up self-supervised learning for improved surgical foundation\n models","summary":" Foundation models have revolutionized computer vision by achieving vastly\nsuperior performance across diverse tasks through large-scale pretraining on\nextensive datasets. However, their application in surgical computer vision has\nbeen limited. This study addresses this gap by introducing SurgeNetXL, a novel\nsurgical foundation model that sets a new benchmark in surgical computer\nvision. Trained on the largest reported surgical dataset to date, comprising\nover 4.7 million video frames, SurgeNetXL achieves consistent top-tier\nperformance across six datasets spanning four surgical procedures and three\ntasks, including semantic segmentation, phase recognition, and critical view of\nsafety (CVS) classification. Compared with the best-performing surgical\nfoundation models, SurgeNetXL shows mean improvements of 2.4, 9.0, and 12.6\npercent for semantic segmentation, phase recognition, and CVS classification,\nrespectively. Additionally, SurgeNetXL outperforms the best-performing\nImageNet-based variants by 14.4, 4.0, and 1.6 percent in the respective tasks.\nIn addition to advancing model performance, this study provides key insights\ninto scaling pretraining datasets, extending training durations, and optimizing\nmodel architectures specifically for surgical computer vision. These findings\npave the way for improved generalizability and robustness in data-scarce\nscenarios, offering a comprehensive framework for future research in this\ndomain. All models and a subset of the SurgeNetXL dataset, including over 2\nmillion video frames, are publicly available at:\nhttps://github.com/TimJaspers0801/SurgeNet.\n","authors":["Tim J. M. Jaspers","Ronald L. P. D. de Jong","Yiping Li","Carolus H. J. Kusters","Franciscus H. A. Bakker","Romy C. van Jaarsveld","Gino M. Kuiper","Richard van Hillegersberg","Jelle P. Ruurda","Willem M. Brinkman","Josien P. W. Pluim","Peter H. N. de With","Marcel Breeuwer","Yasmina Al Khalil","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2501.09436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09433v1","updated":"2025-01-16T10:03:15Z","published":"2025-01-16T10:03:15Z","title":"CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation","summary":" The synthesis of high-quality 3D assets from textual or visual inputs has\nbecome a central objective in modern generative modeling. Despite the\nproliferation of 3D generation algorithms, they frequently grapple with\nchallenges such as multi-view inconsistency, slow generation times, low\nfidelity, and surface reconstruction problems. While some studies have\naddressed some of these issues, a comprehensive solution remains elusive. In\nthis paper, we introduce \\textbf{CaPa}, a carve-and-paint framework that\ngenerates high-fidelity 3D assets efficiently. 
CaPa employs a two-stage\nprocess, decoupling geometry generation from texture synthesis. Initially, a 3D\nlatent diffusion model generates geometry guided by multi-view inputs, ensuring\nstructural consistency across perspectives. Subsequently, leveraging a novel,\nmodel-agnostic Spatially Decoupled Attention, the framework synthesizes\nhigh-resolution textures (up to 4K) for a given geometry. Furthermore, we\npropose a 3D-aware occlusion inpainting algorithm that fills untextured\nregions, resulting in cohesive results across the entire model. This pipeline\ngenerates high-quality 3D assets in less than 30 seconds, providing\nready-to-use outputs for commercial applications. Experimental results\ndemonstrate that CaPa excels in both texture fidelity and geometric stability,\nestablishing a new standard for practical, scalable 3D asset generation.\n","authors":["Hwan Heo","Jangyeong Kim","Seongyeong Lee","Jeong A Wi","Junyoung Choi","Sangjun Ahn"],"pdf_url":"https://arxiv.org/pdf/2501.09433v1.pdf","comment":"project page: https://ncsoft.github.io/CaPa/"},{"id":"http://arxiv.org/abs/2501.09428v1","updated":"2025-01-16T09:57:40Z","published":"2025-01-16T09:57:40Z","title":"AugRefer: Advancing 3D Visual Grounding via Cross-Modal Augmentation and\n Spatial Relation-based Referring","summary":" 3D visual grounding (3DVG), which aims to correlate a natural language\ndescription with the target object within a 3D scene, is a significant yet\nchallenging task. Despite recent advancements in this domain, existing\napproaches commonly encounter a shortage: a limited amount and diversity of\ntext3D pairs available for training. Moreover, they fall short in effectively\nleveraging different contextual clues (e.g., rich spatial relations within the\n3D visual space) for grounding. To address these limitations, we propose\nAugRefer, a novel approach for advancing 3D visual grounding. AugRefer\nintroduces cross-modal augmentation designed to extensively generate diverse\ntext-3D pairs by placing objects into 3D scenes and creating accurate and\nsemantically rich descriptions using foundation models. Notably, the resulting\npairs can be utilized by any existing 3DVG methods for enriching their training\ndata. Additionally, AugRefer presents a language-spatial adaptive decoder that\neffectively adapts the potential referring objects based on the language\ndescription and various 3D spatial relations. Extensive experiments on three\nbenchmark datasets clearly validate the effectiveness of AugRefer.\n","authors":["Xinyi Wang","Na Zhao","Zhiyuan Han","Dan Guo","Xun Yang"],"pdf_url":"https://arxiv.org/pdf/2501.09428v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2501.09425v1","updated":"2025-01-16T09:55:42Z","published":"2025-01-16T09:55:42Z","title":"Vision-Language Models Do Not Understand Negation","summary":" Many practical vision-language applications require models that understand\nnegation, e.g., when using natural language to retrieve images which contain\ncertain objects but not others. Despite advancements in vision-language models\n(VLMs) through large-scale training, their ability to comprehend negation\nremains underexplored. This study addresses the question: how well do current\nVLMs understand negation? We introduce NegBench, a new benchmark designed to\nevaluate negation understanding across 18 task variations and 79k examples\nspanning image, video, and medical datasets. 
The benchmark consists of two core\ntasks designed to evaluate negation understanding in diverse multimodal\nsettings: Retrieval with Negation and Multiple Choice Questions with Negated\nCaptions. Our evaluation reveals that modern VLMs struggle significantly with\nnegation, often performing at chance level. To address these shortcomings, we\nexplore a data-centric approach wherein we finetune CLIP models on large-scale\nsynthetic datasets containing millions of negated captions. We show that this\napproach can result in a 10% increase in recall on negated queries and a 40%\nboost in accuracy on multiple-choice questions with negated captions.\n","authors":["Kumail Alhamoud","Shaden Alshammari","Yonglong Tian","Guohao Li","Philip Torr","Yoon Kim","Marzyeh Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2501.09425v1.pdf","comment":"Project page: https://negbench.github.io"},{"id":"http://arxiv.org/abs/2501.09420v1","updated":"2025-01-16T09:47:18Z","published":"2025-01-16T09:47:18Z","title":"Dynamic Neural Style Transfer for Artistic Image Generation using VGG19","summary":" Throughout history, humans have created remarkable works of art, but\nartificial intelligence has only recently started to make strides in generating\nvisually compelling art. Breakthroughs in the past few years have focused on\nusing convolutional neural networks (CNNs) to separate and manipulate the\ncontent and style of images, applying texture synthesis techniques.\nNevertheless, a number of current techniques continue to encounter obstacles,\nincluding lengthy processing times, restricted choices of style images, and the\ninability to modify the weight ratio of styles. We proposed a neural style\ntransfer system that can add various artistic styles to a desired image to\naddress these constraints allowing flexible adjustments to style weight ratios\nand reducing processing time. The system uses the VGG19 model for feature\nextraction, ensuring high-quality, flexible stylization without compromising\ncontent integrity.\n","authors":["Kapil Kashyap","Mehak Garg","Sean Fargose","Sindhu Nair"],"pdf_url":"https://arxiv.org/pdf/2501.09420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09411v1","updated":"2025-01-16T09:38:22Z","published":"2025-01-16T09:38:22Z","title":"Towards Robust and Realistic Human Pose Estimation via WiFi Signals","summary":" Robust WiFi-based human pose estimation is a challenging task that bridges\ndiscrete and subtle WiFi signals to human skeletons. This paper revisits this\nproblem and reveals two critical yet overlooked issues: 1) cross-domain gap,\ni.e., due to significant variations between source-target domain pose\ndistributions; and 2) structural fidelity gap, i.e., predicted skeletal poses\nmanifest distorted topology, usually with misplaced joints and disproportionate\nbone lengths. This paper fills these gaps by reformulating the task into a\nnovel two-phase framework dubbed DT-Pose: Domain-consistent representation\nlearning and Topology-constrained Pose decoding. Concretely, we first propose a\ntemporal-consistent contrastive learning strategy with uniformity\nregularization, coupled with self-supervised masking-reconstruction operations,\nto enable robust learning of domain-consistent and motion-discriminative\nWiFi-specific representations. 
Beyond this, we introduce a simple yet effective\npose decoder with task prompts, which integrates Graph Convolution Network\n(GCN) and Transformer layers to constrain the topology structure of the\ngenerated skeleton by exploring the adjacent-overarching relationships among\nhuman joints. Extensive experiments conducted on various benchmark datasets\nhighlight the superior performance of our method in tackling these fundamental\nchallenges in both 2D/3D human pose estimation tasks.\n","authors":["Yang Chen","Jingcai Guo","Song Guo","Jingren Zhou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2501.09411v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2406.04829v3","updated":"2025-01-16T09:31:01Z","published":"2024-06-07T10:54:40Z","title":"IOR: Inversed Objects Replay for Incremental Object Detection","summary":" Existing Incremental Object Detection (IOD) methods partially alleviate\ncatastrophic forgetting when incrementally detecting new objects in real-world\nscenarios. However, many of these methods rely on the assumption that unlabeled\nold-class objects may co-occur with labeled new-class objects in the\nincremental data. When unlabeled old-class objects are absent, the performance\nof existing methods tends to degrade. The absence can be mitigated by\ngenerating old-class samples, but it incurs high costs. This paper argues that\nprevious generation-based IOD suffers from redundancy, both in the use of\ngenerative models, which require additional training and storage, and in the\noverproduction of generated samples, many of which do not contribute\nsignificantly to performance improvements. To eliminate the redundancy, we\npropose Inversed Objects Replay (IOR). Specifically, we generate old-class\nsamples by inversing the original detectors, thus eliminating the necessity of\ntraining and storing additional generative models. We propose augmented replay\nto reuse the objects in generated samples, reducing redundant generations.\nMoreover, we propose high-value knowledge distillation focusing on the\npositions of old-class objects overwhelmed by the background, which transfers\nthe knowledge to the incremental detector. Extensive experiments conducted on\nMS COCO 2017 demonstrate that our method can efficiently improve detection\nperformance in IOD scenarios with the absence of old-class objects.\n","authors":["Zijia An","Boyu Diao","Libo Huang","Ruiqi Liu","Zhulin An","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2406.04829v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09403v1","updated":"2025-01-16T09:18:59Z","published":"2025-01-16T09:18:59Z","title":"PISCO: Self-Supervised k-Space Regularization for Improved Neural\n Implicit k-Space Representations of Dynamic MRI","summary":" Neural implicit k-space representations (NIK) have shown promising results\nfor dynamic magnetic resonance imaging (MRI) at high temporal resolutions. Yet,\nreducing acquisition time, and thereby available training data, results in\nsevere performance drops due to overfitting. To address this, we introduce a\nnovel self-supervised k-space loss function $\\mathcal{L}_\\mathrm{PISCO}$,\napplicable for regularization of NIK-based reconstructions. The proposed loss\nfunction is based on the concept of parallel imaging-inspired self-consistency\n(PISCO), enforcing a consistent global k-space neighborhood relationship\nwithout requiring additional data. 
Quantitative and qualitative evaluations on\nstatic and dynamic MR reconstructions show that integrating PISCO significantly\nimproves NIK representations. Particularly for high acceleration factors\n(R$\\geq$54), NIK with PISCO achieves superior spatio-temporal reconstruction\nquality compared to state-of-the-art methods. Furthermore, an extensive\nanalysis of the loss assumptions and stability shows PISCO's potential as\nversatile self-supervised k-space loss function for further applications and\narchitectures. Code is available at:\nhttps://github.com/compai-lab/2025-pisco-spieker\n","authors":["Veronika Spieker","Hannah Eichhorn","Wenqi Huang","Jonathan K. Stelter","Tabita Catalan","Rickmer F. Braren","Daniel Rueckert","Francisco Sahli Costabal","Kerstin Hammernik","Dimitrios C. Karampinos","Claudia Prieto","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2501.09403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09396v1","updated":"2025-01-16T09:07:01Z","published":"2025-01-16T09:07:01Z","title":"Joint Transmission and Deblurring: A Semantic Communication Approach\n Using Events","summary":" Deep learning-based joint source-channel coding (JSCC) is emerging as a\npromising technology for effective image transmission. However, most existing\napproaches focus on transmitting clear images, overlooking real-world\nchallenges such as motion blur caused by camera shaking or fast-moving objects.\nMotion blur often degrades image quality, making transmission and\nreconstruction more challenging. Event cameras, which asynchronously record\npixel intensity changes with extremely low latency, have shown great potential\nfor motion deblurring tasks. However, the efficient transmission of the\nabundant data generated by event cameras remains a significant challenge. In\nthis work, we propose a novel JSCC framework for the joint transmission of\nblurry images and events, aimed at achieving high-quality reconstructions under\nlimited channel bandwidth. This approach is designed as a deblurring\ntask-oriented JSCC system. Since RGB cameras and event cameras capture the same\nscene through different modalities, their outputs contain both shared and\ndomain-specific information. To avoid repeatedly transmitting the shared\ninformation, we extract and transmit their shared information and\ndomain-specific information, respectively. At the receiver, the received\nsignals are processed by a deblurring decoder to generate clear images.\nAdditionally, we introduce a multi-stage training strategy to train the\nproposed model. Simulation results demonstrate that our method significantly\noutperforms existing JSCC-based image transmission schemes, addressing motion\nblur effectively.\n","authors":["Pujing Yang","Guangyi Zhang","Yunlong Cai","Lei Yu","Guanding Yu"],"pdf_url":"https://arxiv.org/pdf/2501.09396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09393v1","updated":"2025-01-16T09:05:46Z","published":"2025-01-16T09:05:46Z","title":"SVIA: A Street View Image Anonymization Framework for Self-Driving\n Applications","summary":" In recent years, there has been an increasing interest in image\nanonymization, particularly focusing on the de-identification of faces and\nindividuals. However, for self-driving applications, merely de-identifying\nfaces and individuals might not provide sufficient privacy protection since\nstreet views like vehicles and buildings can still disclose locations,\ntrajectories, and other sensitive information. 
Therefore, it remains crucial to\nextend anonymization techniques to street view images to fully preserve the\nprivacy of users, pedestrians, and vehicles. In this paper, we propose a Street\nView Image Anonymization (SVIA) framework for self-driving applications. The\nSVIA framework consists of three integral components: a semantic segmenter to\nsegment an input image into functional regions, an inpainter to generate\nalternatives to privacy-sensitive regions, and a harmonizer to seamlessly\nstitch modified regions to guarantee visual coherence. Compared to existing\nmethods, SVIA achieves a much better trade-off between image generation quality\nand privacy protection, as evidenced by experimental results for five common\nmetrics on two widely used public datasets.\n","authors":["Dongyu Liu","Xuhong Wang","Cen Chen","Yanhao Wang","Shengyue Yao","Yilun Lin"],"pdf_url":"https://arxiv.org/pdf/2501.09393v1.pdf","comment":"8 pages, 6 figures, 3 tables. Accepted by IEEE ITSC 2024"},{"id":"http://arxiv.org/abs/2410.20986v2","updated":"2025-01-16T08:58:44Z","published":"2024-10-28T13:04:44Z","title":"Skinned Motion Retargeting with Dense Geometric Interaction Perception","summary":" Capturing and maintaining geometric interactions among different body parts\nis crucial for successful motion retargeting in skinned characters. Existing\napproaches often overlook body geometries or add a geometry correction stage\nafter skeletal motion retargeting. This results in conflicts between skeleton\ninteraction and geometry correction, leading to issues such as jittery,\ninterpenetration, and contact mismatches. To address these challenges, we\nintroduce a new retargeting framework, MeshRet, which directly models the dense\ngeometric interactions in motion retargeting. Initially, we establish dense\nmesh correspondences between characters using semantically consistent sensors\n(SCS), effective across diverse mesh topologies. Subsequently, we develop a\nnovel spatio-temporal representation called the dense mesh interaction (DMI)\nfield. This field, a collection of interacting SCS feature vectors, skillfully\ncaptures both contact and non-contact interactions between body geometries. By\naligning the DMI field during retargeting, MeshRet not only preserves motion\nsemantics but also prevents self-interpenetration and ensures contact\npreservation. Extensive experiments on the public Mixamo dataset and our\nnewly-collected ScanRet dataset demonstrate that MeshRet achieves\nstate-of-the-art performance. Code available at\nhttps://github.com/abcyzj/MeshRet.\n","authors":["Zijie Ye","Jia-Wei Liu","Jia Jia","Shikun Sun","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2410.20986v2.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2407.03653v3","updated":"2025-01-16T08:55:49Z","published":"2024-07-04T05:48:28Z","title":"reBEN: Refined BigEarthNet Dataset for Remote Sensing Image Analysis","summary":" This paper presents refined BigEarthNet (reBEN) that is a large-scale,\nmulti-modal remote sensing dataset constructed to support deep learning (DL)\nstudies for remote sensing image analysis. The reBEN dataset consists of\n549,488 pairs of Sentinel-1 and Sentinel-2 image patches. 
To construct reBEN,\nwe initially consider the Sentinel-1 and Sentinel-2 tiles used to construct the\nBigEarthNet dataset and then divide them into patches of size 1200 m x 1200 m.\nWe apply atmospheric correction to the Sentinel-2 patches using the latest\nversion of the sen2cor tool, resulting in higher-quality patches compared to\nthose present in BigEarthNet. Each patch is then associated with a pixel-level\nreference map and scene-level multi-labels. This makes reBEN suitable for\npixel- and scene-based learning tasks. The labels are derived from the most\nrecent CORINE Land Cover (CLC) map of 2018 by utilizing the 19-class\nnomenclature as in BigEarthNet. The use of the most recent CLC map results in\novercoming the label noise present in BigEarthNet. Furthermore, we introduce a\nnew geographical-based split assignment algorithm that significantly reduces\nthe spatial correlation among the train, validation, and test sets with respect\nto those present in BigEarthNet. This increases the reliability of the\nevaluation of DL models. To minimize the DL model training time, we introduce\nsoftware tools that convert the reBEN dataset into a DL-optimized data format.\nIn our experiments, we show the potential of reBEN for multi-modal multi-label\nimage classification problems by considering several state-of-the-art DL\nmodels. The pre-trained model weights, associated code, and complete dataset\nare available at https://bigearth.net.\n","authors":["Kai Norman Clasen","Leonard Hackel","Tom Burgert","Gencer Sumbul","Begüm Demir","Volker Markl"],"pdf_url":"https://arxiv.org/pdf/2407.03653v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09372v1","updated":"2025-01-16T08:34:39Z","published":"2025-01-16T08:34:39Z","title":"Image Segmentation with transformers: An Overview, Challenges and Future","summary":" Image segmentation, a key task in computer vision, has traditionally relied\non convolutional neural networks (CNNs), yet these models struggle with\ncapturing complex spatial dependencies, objects with varying scales, need for\nmanually crafted architecture components and contextual information. This paper\nexplores the shortcomings of CNN-based models and the shift towards transformer\narchitectures -to overcome those limitations. This work reviews\nstate-of-the-art transformer-based segmentation models, addressing\nsegmentation-specific challenges and their solutions. The paper discusses\ncurrent challenges in transformer-based segmentation and outlines promising\nfuture trends, such as lightweight architectures and enhanced data efficiency.\nThis survey serves as a guide for understanding the impact of transformers in\nadvancing segmentation capabilities and overcoming the limitations of\ntraditional models.\n","authors":["Deepjyoti Chetia","Debasish Dutta","Sanjib Kr Kalita"],"pdf_url":"https://arxiv.org/pdf/2501.09372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.03659v3","updated":"2025-01-16T08:20:15Z","published":"2025-01-07T09:47:46Z","title":"DehazeGS: Seeing Through Fog with 3D Gaussian Splatting","summary":" Current novel view synthesis tasks primarily rely on high-quality and clear\nimages. However, in foggy scenes, scattering and attenuation can significantly\ndegrade the reconstruction and rendering quality. Although NeRF-based dehazing\nreconstruction algorithms have been developed, their use of deep fully\nconnected neural networks and per-ray sampling strategies leads to high\ncomputational costs. 
Moreover, NeRF's implicit representation struggles to\nrecover fine details from hazy scenes. In contrast, recent advancements in 3D\nGaussian Splatting achieve high-quality 3D scene reconstruction by explicitly\nmodeling point clouds into 3D Gaussians. In this paper, we propose leveraging\nthe explicit Gaussian representation to explain the foggy image formation\nprocess through a physically accurate forward rendering process. We introduce\nDehazeGS, a method capable of decomposing and rendering a fog-free background\nfrom participating media using only muti-view foggy images as input. We model\nthe transmission within each Gaussian distribution to simulate the formation of\nfog. During this process, we jointly learn the atmospheric light and scattering\ncoefficient while optimizing the Gaussian representation of the hazy scene. In\nthe inference stage, we eliminate the effects of scattering and attenuation on\nthe Gaussians and directly project them onto a 2D plane to obtain a clear view.\nExperiments on both synthetic and real-world foggy datasets demonstrate that\nDehazeGS achieves state-of-the-art performance in terms of both rendering\nquality and computational efficiency. visualizations are available at\nhttps://dehazegs.github.io/\n","authors":["Jinze Yu","Yiqun Wang","Zhengda Lu","Jianwei Guo","Yong Li","Hongxing Qin","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.03659v3.pdf","comment":"9 pages,4 figures"},{"id":"http://arxiv.org/abs/2501.05777v2","updated":"2025-01-16T08:20:11Z","published":"2025-01-10T08:18:37Z","title":"StructSR: Refuse Spurious Details in Real-World Image Super-Resolution","summary":" Diffusion-based models have shown great promise in real-world image\nsuper-resolution (Real-ISR), but often generate content with structural errors\nand spurious texture details due to the empirical priors and illusions of these\nmodels. To address this issue, we introduce StructSR, a simple, effective, and\nplug-and-play method that enhances structural fidelity and suppresses spurious\ndetails for diffusion-based Real-ISR. StructSR operates without the need for\nadditional fine-tuning, external model priors, or high-level semantic\nknowledge. At its core is the Structure-Aware Screening (SAS) mechanism, which\nidentifies the image with the highest structural similarity to the\nlow-resolution (LR) input in the early inference stage, allowing us to leverage\nit as a historical structure knowledge to suppress the generation of spurious\ndetails. By intervening in the diffusion inference process, StructSR seamlessly\nintegrates with existing diffusion-based Real-ISR models. 
Our experimental\nresults demonstrate that StructSR significantly improves the fidelity of\nstructure and texture, improving the PSNR and SSIM metrics by an average of\n5.27% and 9.36% on a synthetic dataset (DIV2K-Val) and 4.13% and 8.64% on two\nreal-world datasets (RealSR and DRealSR) when integrated with four\nstate-of-the-art diffusion-based Real-ISR methods.\n","authors":["Yachao Li","Dong Liang","Tianyu Ding","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2501.05777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09363v1","updated":"2025-01-16T08:18:03Z","published":"2025-01-16T08:18:03Z","title":"Identification of Traditional Medicinal Plant Leaves Using an effective\n Deep Learning model and Self-Curated Dataset","summary":" Medicinal plants have been a key component in producing traditional and\nmodern medicines, especially in the field of Ayurveda, an ancient Indian\nmedical system. Producing these medicines and collecting and extracting the\nright plant is a crucial step due to the visually similar nature of some\nplants. The extraction of these plants from nonmedicinal plants requires human\nexpert intervention. To solve the issue of accurate plant identification and\nreduce the need for a human expert in the collection process; employing\ncomputer vision methods will be efficient and beneficial. In this paper, we\nhave proposed a model that solves such issues. The proposed model is a custom\nconvolutional neural network (CNN) architecture with 6 convolution layers,\nmax-pooling layers, and dense layers. The model was tested on three different\ndatasets named Indian Medicinal Leaves Image Dataset,MED117 Medicinal Plant\nLeaf Dataset, and the self-curated dataset by the authors. The proposed model\nachieved respective accuracies of 99.5%, 98.4%, and 99.7% using various\noptimizers including Adam, RMSprop, and SGD with momentum.\n","authors":["Deepjyoti Chetia","Sanjib Kr Kalita","Prof Partha Pratim Baruah","Debasish Dutta","Tanaz Akhter"],"pdf_url":"https://arxiv.org/pdf/2501.09363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09361v1","updated":"2025-01-16T08:17:32Z","published":"2025-01-16T08:17:32Z","title":"Strategic Base Representation Learning via Feature Augmentations for\n Few-Shot Class Incremental Learning","summary":" Few-shot class incremental learning implies the model to learn new classes\nwhile retaining knowledge of previously learned classes with a small number of\ntraining instances. Existing frameworks typically freeze the parameters of the\npreviously learned classes during the incorporation of new classes. However,\nthis approach often results in suboptimal class separation of previously\nlearned classes, leading to overlap between old and new classes. Consequently,\nthe performance of old classes degrades on new classes. To address these\nchallenges, we propose a novel feature augmentation driven contrastive learning\nframework designed to enhance the separation of previously learned classes to\naccommodate new classes. Our approach involves augmenting feature vectors and\nassigning proxy labels to these vectors. This strategy expands the feature\nspace, ensuring seamless integration of new classes within the expanded space.\nAdditionally, we employ a self-supervised contrastive loss to improve the\nseparation between previous classes. We validate our framework through\nexperiments on three FSCIL benchmark datasets: CIFAR100, miniImageNet, and\nCUB200. 
The results demonstrate that our Feature Augmentation driven\nContrastive Learning framework significantly outperforms other approaches,\nachieving state-of-the-art performance.\n","authors":["Parinita Nema","Vinod K Kurmi"],"pdf_url":"https://arxiv.org/pdf/2501.09361v1.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2407.21035v2","updated":"2025-01-16T08:08:57Z","published":"2024-07-17T08:19:11Z","title":"Direct Unlearning Optimization for Robust and Safe Text-to-Image Models","summary":" Recent advancements in text-to-image (T2I) models have unlocked a wide range\nof applications but also present significant risks, particularly in their\npotential to generate unsafe content. To mitigate this issue, researchers have\ndeveloped unlearning techniques to remove the model's ability to generate\npotentially harmful content. However, these methods are easily bypassed by\nadversarial attacks, making them unreliable for ensuring the safety of\ngenerated images. In this paper, we propose Direct Unlearning Optimization\n(DUO), a novel framework for removing Not Safe For Work (NSFW) content from T2I\nmodels while preserving their performance on unrelated topics. DUO employs a\npreference optimization approach using curated paired image data, ensuring that\nthe model learns to remove unsafe visual concepts while retaining unrelated\nfeatures. Furthermore, we introduce an output-preserving regularization term to\nmaintain the model's generative capabilities on safe content. Extensive\nexperiments demonstrate that DUO can robustly defend against various\nstate-of-the-art red teaming methods without significant performance\ndegradation on unrelated topics, as measured by FID and CLIP scores. Our work\ncontributes to the development of safer and more reliable T2I models, paving\nthe way for their responsible deployment in both closed-source and open-source\nscenarios.\n","authors":["Yong-Hyun Park","Sangdoo Yun","Jin-Hwa Kim","Junho Kim","Geonhui Jang","Yonghyun Jeong","Junghyo Jo","Gayoung Lee"],"pdf_url":"https://arxiv.org/pdf/2407.21035v2.pdf","comment":"This paper has been accepted for NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.10869v2","updated":"2025-01-16T08:06:16Z","published":"2024-06-16T09:38:33Z","title":"Geometric Distortion Guided Transformer for Omnidirectional Image\n Super-Resolution","summary":" As virtual and augmented reality applications gain popularity,\nomnidirectional image (ODI) super-resolution has become increasingly important.\nUnlike 2D plain images that are formed on a plane, ODIs are projected onto\nspherical surfaces. Applying established image super-resolution methods to\nODIs, therefore, requires performing equirectangular projection (ERP) to map\nthe ODIs onto a plane. ODI super-resolution needs to take into account\ngeometric distortion resulting from ERP. However, without considering such\ngeometric distortion of ERP images, previous deep-learning-based methods only\nutilize a limited range of pixels and may easily miss self-similar textures for\nreconstruction. In this paper, we introduce a novel Geometric Distortion Guided\nTransformer for Omnidirectional image Super-Resolution (GDGT-OSR).\nSpecifically, a distortion modulated rectangle-window self-attention mechanism,\nintegrated with deformable self-attention, is proposed to better perceive the\ndistortion and thus involve more self-similar textures. 
Distortion modulation\nis achieved through a newly devised distortion guidance generator that produces\nguidance by exploiting the variability of distortion across latitudes.\nFurthermore, we propose a dynamic feature aggregation scheme to adaptively fuse\nthe features from different self-attention modules. We present extensive\nexperimental results on public datasets and show that the new GDGT-OSR\noutperforms methods in existing literature.\n","authors":["Cuixin Yang","Rongkang Dong","Jun Xiao","Cong Zhang","Kin-Man Lam","Fei Zhou","Guoping Qiu"],"pdf_url":"https://arxiv.org/pdf/2406.10869v2.pdf","comment":"13 pages, 12 figures, journal"},{"id":"http://arxiv.org/abs/2501.09355v1","updated":"2025-01-16T08:06:02Z","published":"2025-01-16T08:06:02Z","title":"YETI (YET to Intervene) Proactive Interventions by Multimodal AI Agents\n in Augmented Reality Tasks","summary":" Multimodal AI Agents are AI models that have the capability of interactively\nand cooperatively assisting human users to solve day-to-day tasks. Augmented\nReality (AR) head worn devices can uniquely improve the user experience of\nsolving procedural day-to-day tasks by providing egocentric multimodal (audio\nand video) observational capabilities to AI Agents. Such AR capabilities can\nhelp AI Agents see and listen to actions that users take which can relate to\nmultimodal capabilities of human users. Existing AI Agents, either Large\nLanguage Models (LLMs) or Multimodal Vision-Language Models (VLMs) are reactive\nin nature, which means that models cannot take an action without reading or\nlistening to the human user's prompts. Proactivity of AI Agents on the other\nhand can help the human user detect and correct any mistakes in agent observed\ntasks, encourage users when they do tasks correctly or simply engage in\nconversation with the user - akin to a human teaching or assisting a user. Our\nproposed YET to Intervene (YETI) multimodal agent focuses on the research\nquestion of identifying circumstances that may require the agent to intervene\nproactively. This allows the agent to understand when it can intervene in a\nconversation with human users that can help the user correct mistakes on tasks,\nlike cooking, using AR. Our YETI Agent learns scene understanding signals based\non interpretable notions of Structural Similarity (SSIM) on consecutive video\nframes. We also define the alignment signal which the AI Agent can learn to\nidentify if the video frames corresponding to the user's actions on the task\nare consistent with expected actions. These signals are used by our AI Agent to\ndetermine when it should proactively intervene. We compare our results on the\ninstances of proactive intervention in the HoloAssist multimodal benchmark for\nan expert agent guiding a user to complete procedural tasks.\n","authors":["Saptarashmi Bandyopadhyay","Vikas Bahirwani","Lavisha Aggarwal","Bhanu Guda","Lin Li","Andrea Colaco"],"pdf_url":"https://arxiv.org/pdf/2501.09355v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2501.09350v1","updated":"2025-01-16T08:03:49Z","published":"2025-01-16T08:03:49Z","title":"Making Your Dreams A Reality: Decoding the Dreams into a Coherent Video\n Story from fMRI Signals","summary":" This paper studies the brave new idea for Multimedia community, and proposes\na novel framework to convert dreams into coherent video narratives using fMRI\ndata. Essentially, dreams have intrigued humanity for centuries, offering\nglimpses into our subconscious minds. 
Recent advancements in brain imaging,\nparticularly functional magnetic resonance imaging (fMRI), have provided new\nways to explore the neural basis of dreaming. By combining subjective dream\nexperiences with objective neurophysiological data, we aim to understand the\nvisual aspects of dreams and create complete video narratives. Our process\ninvolves three main steps: reconstructing visual perception, decoding dream\nimagery, and integrating dream stories. Using innovative techniques in fMRI\nanalysis and language modeling, we seek to push the boundaries of dream\nresearch and gain deeper insights into visual experiences during sleep. This\ntechnical report introduces a novel approach to visually decoding dreams using\nfMRI signals and weaving dream visuals into narratives using language models.\nWe gather a dataset of dreams along with descriptions to assess the\neffectiveness of our framework.\n","authors":["Yanwei Fu","Jianxiong Gao","Baofeng Yang","Jianfeng Feng"],"pdf_url":"https://arxiv.org/pdf/2501.09350v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2501.09347v1","updated":"2025-01-16T08:00:17Z","published":"2025-01-16T08:00:17Z","title":"UVRM: A Scalable 3D Reconstruction Model from Unposed Videos","summary":" Large Reconstruction Models (LRMs) have recently become a popular method for\ncreating 3D foundational models. Training 3D reconstruction models with 2D\nvisual data traditionally requires prior knowledge of camera poses for the\ntraining samples, a process that is both time-consuming and prone to errors.\nConsequently, 3D reconstruction training has been confined to either synthetic\n3D datasets or small-scale datasets with annotated poses. In this study, we\ninvestigate the feasibility of 3D reconstruction using unposed video data of\nvarious objects. We introduce UVRM, a novel 3D reconstruction model capable of\nbeing trained and evaluated on monocular videos without requiring any\ninformation about the pose. UVRM uses a transformer network to implicitly\naggregate video frames into a pose-invariant latent feature space, which is\nthen decoded into a tri-plane 3D representation. To obviate the need for\nground-truth pose annotations during training, UVRM employs a combination of\nthe score distillation sampling (SDS) method and an analysis-by-synthesis\napproach, progressively synthesizing pseudo novel-views using a pre-trained\ndiffusion model. We qualitatively and quantitatively evaluate UVRM's\nperformance on the G-Objaverse and CO3D datasets without relying on pose\ninformation. Extensive experiments show that UVRM is capable of effectively and\nefficiently reconstructing a wide range of 3D objects from unposed videos.\n","authors":["Shiu-hong Kao","Xiao Li","Jinglu Wang","Chi-Keung Tang","Yu-Wing Tai","Yan Lu"],"pdf_url":"https://arxiv.org/pdf/2501.09347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.04390v2","updated":"2025-01-16T07:58:06Z","published":"2025-01-08T10:08:09Z","title":"iFADIT: Invertible Face Anonymization via Disentangled Identity\n Transform","summary":" Face anonymization aims to conceal the visual identity of a face to safeguard\nthe individual's privacy. Traditional methods like blurring and pixelation can\nlargely remove identifying features, but these techniques significantly degrade\nimage quality and are vulnerable to deep reconstruction attacks. Generative\nmodels have emerged as a promising solution for anonymizing faces while\npreserving a natural appearance. 
However, many still face limitations in visual\nquality and often overlook the potential to recover the original face from the\nanonymized version, which can be valuable in specific contexts such as image\nforensics. This paper proposes a novel framework named iFADIT, an acronym for\nInvertible Face Anonymization via Disentangled Identity Transform. The\nframework features a disentanglement architecture coupled with a secure\nflow-based model: the former decouples identity information from\nnon-identifying attributes, while the latter transforms the decoupled identity\ninto an anonymized version in an invertible manner controlled by a secret key.\nThe anonymized face can then be reconstructed based on a pre-trained StyleGAN\nthat ensures high image quality and realistic facial details. Recovery of the\noriginal face (aka de-anonymization) is possible upon the availability of the\nmatching secret, by inverting the anonymization process based on the same set\nof model parameters. Furthermore, a dedicated secret-key mechanism along with a\ndual-phase training strategy is devised to ensure the desired properties of\nface anonymization. Qualitative and quantitative experiments demonstrate the\nsuperiority of the proposed approach in anonymity, reversibility, security,\ndiversity, and interpretability over competing methods.\n","authors":["Lin Yuan","Kai Liang","Xiong Li","Tao Wu","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2501.04390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09341v1","updated":"2025-01-16T07:50:56Z","published":"2025-01-16T07:50:56Z","title":"SE-BSFV: Online Subspace Learning based Shadow Enhancement and\n Background Suppression for ViSAR under Complex Background","summary":" Video synthetic aperture radar (ViSAR) has attracted substantial attention in\nthe moving target detection (MTD) field due to its ability to continuously\nmonitor changes in the target area. In ViSAR, the moving targets' shadows will\nnot offset and defocus, which is widely used as a feature for MTD. However, the\nshadows are difficult to distinguish from the low scattering region in the\nbackground, which will cause more missing and false alarms. Therefore, it is\nworth investigating how to enhance the distinction between the shadows and\nbackground. In this study, we proposed the Shadow Enhancement and Background\nSuppression for ViSAR (SE-BSFV) algorithm. The SE-BSFV algorithm is based on\nthe low-rank representation (LRR) theory and adopts online subspace learning\ntechnique to enhance shadows and suppress background for ViSAR images. Firstly,\nwe use a registration algorithm to register the ViSAR images and utilize\nGaussian mixture distribution (GMD) to model the ViSAR data. Secondly, the\nknowledge learned from the previous frames is leveraged to estimate the GMD\nparameters of the current frame, and the Expectation-maximization (EM)\nalgorithm is used to estimate the subspace parameters. Then, the foreground\nmatrix of the current frame can be obtained. Finally, the alternating direction\nmethod of multipliers (ADMM) is used to eliminate strong scattering objects in\nthe foreground matrix to obtain the final results. 
The experimental results\nindicate that the SE-BSFV algorithm significantly enhances the shadows'\nsaliency and greatly improves the detection performance while ensuring\nefficiency compared with several other advanced pre-processing algorithms.\n","authors":["Shangqu Yan","Chenyang Luo","Yaowen Fu","Wenpeng Zhang","Wei Yang","Ruofeng Yu"],"pdf_url":"https://arxiv.org/pdf/2501.09341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.08331v2","updated":"2025-01-16T07:43:19Z","published":"2025-01-14T18:59:10Z","title":"Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using\n Real-Time Warped Noise","summary":" Generative modeling aims to transform random noise into structured outputs.\nIn this work, we enhance video diffusion models by allowing motion control via\nstructured latent noise sampling. This is achieved by just a change in data: we\npre-process training videos to yield structured noise. Consequently, our method\nis agnostic to diffusion model design, requiring no changes to model\narchitectures or training pipelines. Specifically, we propose a novel noise\nwarping algorithm, fast enough to run in real time, that replaces random\ntemporal Gaussianity with correlated warped noise derived from optical flow\nfields, while preserving the spatial Gaussianity. The efficiency of our\nalgorithm enables us to fine-tune modern video diffusion base models using\nwarped noise with minimal overhead, and provide a one-stop solution for a wide\nrange of user-friendly motion control: local object motion control, global\ncamera movement control, and motion transfer. The harmonization between\ntemporal coherence and spatial Gaussianity in our warped noise leads to\neffective motion control while maintaining per-frame pixel quality. Extensive\nexperiments and user studies demonstrate the advantages of our method, making\nit a robust and scalable approach for controlling motion in video diffusion\nmodels. Video results are available on our webpage:\nhttps://vgenai-netflix-eyeline-research.github.io/Go-with-the-Flow. Source code\nand model checkpoints are available on GitHub:\nhttps://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow.\n","authors":["Ryan Burgert","Yuancheng Xu","Wenqi Xian","Oliver Pilarski","Pascal Clausen","Mingming He","Li Ma","Yitong Deng","Lingxiao Li","Mohsen Mousavi","Michael Ryoo","Paul Debevec","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2501.08331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20406v3","updated":"2025-01-16T07:26:52Z","published":"2024-10-27T10:35:47Z","title":"Point-PRC: A Prompt Learning Based Regulation Framework for\n Generalizable Point Cloud Analysis","summary":" This paper investigates the 3D domain generalization (3DDG) ability of large\n3D models based on prevalent prompt learning. Recent works demonstrate the\nperformances of 3D point cloud recognition can be boosted remarkably by\nparameter-efficient prompt tuning. However, we observe that the improvement on\ndownstream tasks comes at the expense of a severe drop in 3D domain\ngeneralization. To resolve this challenge, we present a comprehensive\nregulation framework that allows the learnable prompts to actively interact\nwith the well-learned general knowledge in large 3D models to maintain good\ngeneralization. Specifically, the proposed framework imposes multiple explicit\nconstraints on the prompt learning trajectory by maximizing the mutual\nagreement between task-specific predictions and task-agnostic knowledge. 
We\ndesign the regulation framework as a plug-and-play module to embed into\nexisting representative large 3D models. Surprisingly, our method not only\nrealizes consistently increasing generalization ability but also enhances\ntask-specific 3D recognition performances across various 3DDG benchmarks by a\nclear margin. Considering the lack of study and evaluation on 3DDG, we also\ncreate three new benchmarks, namely base-to-new, cross-dataset and few-shot\ngeneralization benchmarks, to enrich the field and inspire future research.\nCode and benchmarks are available at\n\\url{https://github.com/auniquesun/Point-PRC}.\n","authors":["Hongyu Sun","Qiuhong Ke","Yongcai Wang","Wang Chen","Kang Yang","Deying Li","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2410.20406v3.pdf","comment":"5 figures, 14 tables; accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2501.09333v1","updated":"2025-01-16T07:07:41Z","published":"2025-01-16T07:07:41Z","title":"Prompt-CAM: A Simpler Interpretable Transformer for Fine-Grained\n Analysis","summary":" We present a simple usage of pre-trained Vision Transformers (ViTs) for\nfine-grained analysis, aiming to identify and localize the traits that\ndistinguish visually similar categories, such as different bird species or dog\nbreeds. Pre-trained ViTs such as DINO have shown remarkable capabilities to\nextract localized, informative features. However, using saliency maps like\nGrad-CAM can hardly point out the traits: they often locate the whole object by\na blurred, coarse heatmap, not traits. We propose a novel approach Prompt Class\nAttention Map (Prompt-CAM) to the rescue. Prompt-CAM learns class-specific\nprompts to a pre-trained ViT and uses the corresponding outputs for\nclassification. To classify an image correctly, the true-class prompt must\nattend to the unique image patches not seen in other classes' images, i.e.,\ntraits. As such, the true class's multi-head attention maps reveal traits and\ntheir locations. Implementation-wise, Prompt-CAM is almost a free lunch by\nsimply modifying the prediction head of Visual Prompt Tuning (VPT). This makes\nPrompt-CAM fairly easy to train and apply, sharply contrasting other\ninterpretable methods that design specific models and training processes. It is\neven simpler than the recently published INterpretable TRansformer (INTR),\nwhose encoder-decoder architecture prevents it from leveraging pre-trained\nViTs. Extensive empirical studies on a dozen datasets from various domains\n(e.g., birds, fishes, insects, fungi, flowers, food, and cars) validate\nPrompt-CAM superior interpretation capability.\n","authors":["Arpita Chowdhury","Dipanjyoti Paul","Zheda Mai","Jianyang Gu","Ziheng Zhang","Kazi Sajeed Mehrab","Elizabeth G. Campolongo","Daniel Rubenstein","Charles V. 
Stewart","Anuj Karpatne","Tanya Berger-Wolf","Yu Su","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2501.09333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19043v2","updated":"2025-01-16T06:46:18Z","published":"2024-06-27T09:50:20Z","title":"CMRxRecon2024: A Multi-Modality, Multi-View K-Space Dataset Boosting\n Universal Machine Learning for Accelerated Cardiac MRI","summary":" Cardiac magnetic resonance imaging (MRI) has emerged as a clinically\ngold-standard technique for diagnosing cardiac diseases, thanks to its ability\nto provide diverse information with multiple modalities and anatomical views.\nAccelerated cardiac MRI is highly expected to achieve time-efficient and\npatient-friendly imaging, and then advanced image reconstruction approaches are\nrequired to recover high-quality, clinically interpretable images from\nundersampled measurements. However, the lack of publicly available cardiac MRI\nk-space dataset in terms of both quantity and diversity has severely hindered\nsubstantial technological progress, particularly for data-driven artificial\nintelligence. Here, we provide a standardized, diverse, and high-quality\nCMRxRecon2024 dataset to facilitate the technical development, fair evaluation,\nand clinical transfer of cardiac MRI reconstruction approaches, towards\npromoting the universal frameworks that enable fast and robust reconstructions\nacross different cardiac MRI protocols in clinical practice. To the best of our\nknowledge, the CMRxRecon2024 dataset is the largest and most protocal-diverse\npublicly available cardiac k-space dataset. It is acquired from 330 healthy\nvolunteers, covering commonly used modalities, anatomical views, and\nacquisition trajectories in clinical cardiac MRI workflows. Besides, an open\nplatform with tutorials, benchmarks, and data processing tools is provided to\nfacilitate data usage, advanced method development, and fair performance\nevaluation.\n","authors":["Zi Wang","Fanwen Wang","Chen Qin","Jun Lyu","Cheng Ouyang","Shuo Wang","Yan Li","Mengyao Yu","Haoyu Zhang","Kunyuan Guo","Zhang Shi","Qirong Li","Ziqiang Xu","Yajing Zhang","Hao Li","Sha Hua","Binghua Chen","Longyu Sun","Mengting Sun","Qin Li","Ying-Hua Chu","Wenjia Bai","Jing Qin","Xiahai Zhuang","Claudia Prieto","Alistair Young","Michael Markl","He Wang","Lianming Wu","Guang Yang","Xiaobo Qu","Chengyan Wang"],"pdf_url":"https://arxiv.org/pdf/2406.19043v2.pdf","comment":"23 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.09321v1","updated":"2025-01-16T06:25:56Z","published":"2025-01-16T06:25:56Z","title":"Soft Knowledge Distillation with Multi-Dimensional Cross-Net Attention\n for Image Restoration Models Compression","summary":" Transformer-based encoder-decoder models have achieved remarkable success in\nimage-to-image transfer tasks, particularly in image restoration. However,\ntheir high computational complexity-manifested in elevated FLOPs and parameter\ncounts-limits their application in real-world scenarios. Existing knowledge\ndistillation methods in image restoration typically employ lightweight student\nmodels that directly mimic the intermediate features and reconstruction results\nof the teacher, overlooking the implicit attention relationships between them.\nTo address this, we propose a Soft Knowledge Distillation (SKD) strategy that\nincorporates a Multi-dimensional Cross-net Attention (MCA) mechanism for\ncompressing image restoration models. 
This mechanism facilitates interaction\nbetween the student and teacher across both channel and spatial dimensions,\nenabling the student to implicitly learn the attention matrices. Additionally,\nwe employ a Gaussian kernel function to measure the distance between student\nand teacher features in kernel space, ensuring stable and efficient feature\nlearning. To further enhance the quality of reconstructed images, we replace\nthe commonly used L1 or KL divergence loss with a contrastive learning loss at\nthe image level. Experiments on three tasks-image deraining, deblurring, and\ndenoising-demonstrate that our SKD strategy significantly reduces computational\ncomplexity while maintaining strong image restoration capabilities.\n","authors":["Yongheng Zhang","Danfeng Yan"],"pdf_url":"https://arxiv.org/pdf/2501.09321v1.pdf","comment":"Accepted by ICASSP2025"},{"id":"http://arxiv.org/abs/2501.09311v1","updated":"2025-01-16T05:58:32Z","published":"2025-01-16T05:58:32Z","title":"Shape-Based Single Object Classification Using Ensemble Method\n Classifiers","summary":" Nowadays, more and more images are available. Annotation and retrieval of the\nimages pose classification problems, where each class is defined as the group\nof database images labelled with a common semantic label. Various systems have\nbeen proposed for content-based retrieval, as well as for image classification\nand indexing. In this paper, a hierarchical classification framework has been\nproposed for bridging the semantic gap effectively and achieving multi-category\nimage classification. A well known pre-processing and post-processing method\nwas used and applied to three problems; image segmentation, object\nidentification and image classification. The method was applied to classify\nsingle object images from Amazon and Google datasets. The classification was\ntested for four different classifiers; BayesNetwork (BN), Random Forest (RF),\nBagging and Vote. The estimated classification accuracies ranged from 20% to\n99% (using 10-fold cross validation). The Bagging classifier presents the best\nperformance, followed by the Random Forest classifier.\n","authors":["Nur Shazwani Kamarudin","Mokhairi Makhtar","Syadiah Nor Wan Shamsuddin","Syed Abdullah Fadzli"],"pdf_url":"https://arxiv.org/pdf/2501.09311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01432v3","updated":"2025-01-16T05:42:28Z","published":"2024-07-18T19:44:44Z","title":"VLG-CBM: Training Concept Bottleneck Models with Vision-Language\n Guidance","summary":" Concept Bottleneck Models (CBMs) provide interpretable prediction by\nintroducing an intermediate Concept Bottleneck Layer (CBL), which encodes\nhuman-understandable concepts to explain models' decision. Recent works\nproposed to utilize Large Language Models and pre-trained Vision-Language\nModels to automate the training of CBMs, making it more scalable and automated.\nHowever, existing approaches still fall short in two aspects: First, the\nconcepts predicted by CBL often mismatch the input image, raising doubts about\nthe faithfulness of interpretation. Second, it has been shown that concept\nvalues encode unintended information: even a set of random concepts could\nachieve comparable test accuracy to state-of-the-art CBMs. To address these\ncritical limitations, in this work, we propose a novel framework called\nVision-Language-Guided Concept Bottleneck Model (VLG-CBM) to enable faithful\ninterpretability with the benefits of boosted performance. 
Our method leverages\noff-the-shelf open-domain grounded object detectors to provide visually\ngrounded concept annotation, which largely enhances the faithfulness of concept\nprediction while further improving the model performance. In addition, we\npropose a new metric called Number of Effective Concepts (NEC) to control the\ninformation leakage and provide better interpretability. Extensive evaluations\nacross five standard benchmarks show that our method, VLG-CBM, outperforms\nexisting methods by at least 4.27% and up to 51.09% on Accuracy at NEC=5\n(denoted as ANEC-5), and by at least 0.45% and up to 29.78% on average accuracy\n(denoted as ANEC-avg), while preserving both faithfulness and interpretability\nof the learned concepts as demonstrated in extensive experiments.\n","authors":["Divyansh Srivastava","Ge Yan","Tsui-Wei Weng"],"pdf_url":"https://arxiv.org/pdf/2408.01432v3.pdf","comment":"Appeared at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2501.09305v1","updated":"2025-01-16T05:39:50Z","published":"2025-01-16T05:39:50Z","title":"Domain-conditioned and Temporal-guided Diffusion Modeling for\n Accelerated Dynamic MRI Reconstruction","summary":" Purpose: To propose a domain-conditioned and temporal-guided diffusion\nmodeling method, termed dynamic Diffusion Modeling (dDiMo), for accelerated\ndynamic MRI reconstruction, enabling diffusion process to characterize\nspatiotemporal information for time-resolved multi-coil Cartesian and\nnon-Cartesian data. Methods: The dDiMo framework integrates temporal\ninformation from time-resolved dimensions, allowing for the concurrent capture\nof intra-frame spatial features and inter-frame temporal dynamics in diffusion\nmodeling. It employs additional spatiotemporal ($x$-$t$) and self-consistent\nfrequency-temporal ($k$-$t$) priors to guide the diffusion process. This\napproach ensures precise temporal alignment and enhances the recovery of fine\nimage details. To facilitate a smooth diffusion process, the nonlinear\nconjugate gradient algorithm is utilized during the reverse diffusion steps.\nThe proposed model was tested on two types of MRI data: Cartesian-acquired\nmulti-coil cardiac MRI and Golden-Angle-Radial-acquired multi-coil\nfree-breathing lung MRI, across various undersampling rates. Results: dDiMo\nachieved high-quality reconstructions at various acceleration factors,\ndemonstrating improved temporal alignment and structural recovery compared to\nother competitive reconstruction methods, both qualitatively and\nquantitatively. This proposed diffusion framework exhibited robust performance\nin handling both Cartesian and non-Cartesian acquisitions, effectively\nreconstructing dynamic datasets in cardiac and lung MRI under different imaging\nconditions. Conclusion: This study introduces a novel diffusion modeling method\nfor dynamic MRI reconstruction.\n","authors":["Liping Zhang","Iris Yuwen Zhou","Sydney B. Montesi","Li Feng","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2501.09305v1.pdf","comment":"21 pages, 15 figures, 2 tables"},{"id":"http://arxiv.org/abs/2501.09304v1","updated":"2025-01-16T05:39:28Z","published":"2025-01-16T05:39:28Z","title":"Finding the Trigger: Causal Abductive Reasoning on Video Events","summary":" This paper introduces a new problem, Causal Abductive Reasoning on Video\nEvents (CARVE), which involves identifying causal relationships between events\nin a video and generating hypotheses about causal chains that account for the\noccurrence of a target event. 
To facilitate research in this direction, we\ncreate two new benchmark datasets with both synthetic and realistic videos,\naccompanied by trigger-target labels generated through a novel counterfactual\nsynthesis approach. To explore the challenge of solving CARVE, we present a\nCausal Event Relation Network (CERN) that examines the relationships between\nvideo events in temporal and semantic spaces to efficiently determine the\nroot-cause trigger events. Through extensive experiments, we demonstrate the\ncritical roles of event relational representation learning and interaction\nmodeling in solving video causal reasoning challenges. The introduction of the\nCARVE task, along with the accompanying datasets and the CERN framework, will\nadvance future research on video causal reasoning and significantly facilitate\nvarious applications, including video surveillance, root-cause analysis and\nmovie content management.\n","authors":["Thao Minh Le","Vuong Le","Kien Do","Sunil Gupta","Svetha Venkatesh","Truyen Tran"],"pdf_url":"https://arxiv.org/pdf/2501.09304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09302v1","updated":"2025-01-16T05:37:29Z","published":"2025-01-16T05:37:29Z","title":"Creating Virtual Environments with 3D Gaussian Splatting: A Comparative\n Study","summary":" 3D Gaussian Splatting (3DGS) has recently emerged as an innovative and\nefficient 3D representation technique. While its potential for extended reality\n(XR) applications is frequently highlighted, its practical effectiveness\nremains underexplored. In this work, we examine three distinct 3DGS-based\napproaches for virtual environment (VE) creation, leveraging their unique\nstrengths for efficient and visually compelling scene representation. By\nconducting a comparable study, we evaluate the feasibility of 3DGS in creating\nimmersive VEs, identify its limitations in XR applications, and discuss future\nresearch and development opportunities.\n","authors":["Shi Qiu","Binzhu Xie","Qixuan Liu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2501.09302v1.pdf","comment":"IEEE VR 2025 Posters"},{"id":"http://arxiv.org/abs/2501.09294v1","updated":"2025-01-16T05:01:30Z","published":"2025-01-16T05:01:30Z","title":"Efficient Few-Shot Medical Image Analysis via Hierarchical Contrastive\n Vision-Language Learning","summary":" Few-shot learning in medical image classification presents a significant\nchallenge due to the limited availability of annotated data and the complex\nnature of medical imagery. In this work, we propose Adaptive Vision-Language\nFine-tuning with Hierarchical Contrastive Alignment (HiCA), a novel framework\nthat leverages the capabilities of Large Vision-Language Models (LVLMs) for\nmedical image analysis. HiCA introduces a two-stage fine-tuning strategy,\ncombining domain-specific pretraining and hierarchical contrastive learning to\nalign visual and textual representations at multiple levels. We evaluate our\napproach on two benchmark datasets, Chest X-ray and Breast Ultrasound,\nachieving state-of-the-art performance in both few-shot and zero-shot settings.\nFurther analyses demonstrate the robustness, generalizability, and\ninterpretability of our method, with substantial improvements in performance\ncompared to existing baselines. 
Our work highlights the potential of\nhierarchical contrastive strategies in adapting LVLMs to the unique challenges\nof medical imaging tasks.\n","authors":["Harrison Fuller","Fernando Gabriela Garcia","Victor Flores"],"pdf_url":"https://arxiv.org/pdf/2501.09294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03789v3","updated":"2025-01-16T04:13:10Z","published":"2023-07-07T18:28:44Z","title":"Synthesizing Forestry Images Conditioned on Plant Phenotype Using a\n Generative Adversarial Network","summary":" Plant phenology and phenotype prediction using remote sensing data are\nincreasingly gaining attention within the plant science community as a\npromising approach to enhance agricultural productivity. This work focuses on\ngenerating synthetic forestry images that satisfy certain phenotypic\nattributes, viz. canopy greenness. We harness a Generative Adversarial Network\n(GAN) to synthesize biologically plausible and phenotypically stable forestry\nimages conditioned on the greenness of vegetation (a continuous attribute) over\na specific region of interest, describing a particular vegetation type in a\nmixed forest. The training data is based on the automated digital camera\nimagery provided by the National Ecological Observatory Network (NEON) and\nprocessed by the PhenoCam Network. Our method helps render the appearance of\nforest sites specific to a greenness value. The synthetic images are\nsubsequently utilized to predict another phenotypic attribute, viz., redness of\nplants. The quality of the synthetic images is assessed using the Structural\nSIMilarity (SSIM) index and Fr\\'echet Inception Distance (FID). Further, the\ngreenness and redness indices of the synthetic images are compared against\nthose of the original images using Root Mean Squared Percentage Error (RMSPE)\nto evaluate their accuracy and integrity. The generalizability and scalability\nof our proposed GAN model are established by effectively transforming it to\ngenerate synthetic images for other forest sites and vegetation types. From a\nbroader perspective, this approach could be leveraged to visualize forestry\nbased on different phenotypic attributes in the context of various\nenvironmental parameters.\n","authors":["Debasmita Pal","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2307.03789v3.pdf","comment":"Accepted to Pattern Recognition journal"},{"id":"http://arxiv.org/abs/2501.09281v1","updated":"2025-01-16T04:06:59Z","published":"2025-01-16T04:06:59Z","title":"SoccerSynth-Detection: A Synthetic Dataset for Soccer Player Detection","summary":" In soccer video analysis, player detection is essential for identifying key\nevents and reconstructing tactical positions. The presence of numerous players\nand frequent occlusions, combined with copyright restrictions, severely\nrestricts the availability of datasets, leaving limited options such as\nSoccerNet-Tracking and SportsMOT. These datasets suffer from a lack of\ndiversity, which hinders algorithms from adapting effectively to varied soccer\nvideo contexts. To address these challenges, we developed\nSoccerSynth-Detection, the first synthetic dataset designed for the detection\nof synthetic soccer players. It includes a broad range of random lighting and\ntextures, as well as simulated camera motion blur. We validated its efficacy\nusing the object detection model (Yolov8n) against real-world datasets\n(SoccerNet-Tracking and SportsMoT). 
In transfer tests, it matched the\nperformance of real datasets and significantly outperformed them in images with\nmotion blur; in pre-training tests, it demonstrated its efficacy as a\npre-training dataset, significantly enhancing the algorithm's overall\nperformance. Our work demonstrates the potential of synthetic datasets to\nreplace real datasets for algorithm training in the field of soccer video\nanalysis.\n","authors":["Haobin Qin","Calvin Yeung","Rikuhei Umemoto","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2501.09281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09278v1","updated":"2025-01-16T03:54:06Z","published":"2025-01-16T03:54:06Z","title":"Text-guided Synthetic Geometric Augmentation for Zero-shot 3D\n Understanding","summary":" Zero-shot recognition models require extensive training data for\ngeneralization. However, in zero-shot 3D classification, collecting 3D data and\ncaptions is costly and laborintensive, posing a significant barrier compared to\n2D vision. Recent advances in generative models have achieved unprecedented\nrealism in synthetic data production, and recent research shows the potential\nfor using generated data as training data. Here, naturally raising the\nquestion: Can synthetic 3D data generated by generative models be used as\nexpanding limited 3D datasets? In response, we present a synthetic 3D dataset\nexpansion method, Textguided Geometric Augmentation (TeGA). TeGA is tailored\nfor language-image-3D pretraining, which achieves SoTA in zero-shot 3D\nclassification, and uses a generative textto-3D model to enhance and extend\nlimited 3D datasets. Specifically, we automatically generate text-guided\nsynthetic 3D data and introduce a consistency filtering strategy to discard\nnoisy samples where semantics and geometric shapes do not match with text. In\nthe experiment to double the original dataset size using TeGA, our approach\ndemonstrates improvements over the baselines, achieving zeroshot performance\ngains of 3.0% on Objaverse-LVIS, 4.6% on ScanObjectNN, and 8.7% on ModelNet40.\nThese results demonstrate that TeGA effectively bridges the 3D data gap,\nenabling robust zero-shot 3D classification even with limited real training\ndata and paving the way for zero-shot 3D vision application.\n","authors":["Kohei Torimi","Ryosuke Yamada","Daichi Otsuka","Kensho Hara","Yuki M. Asano","Hirokatsu Kataoka","Yoshimitsu Aoki"],"pdf_url":"https://arxiv.org/pdf/2501.09278v1.pdf","comment":"14 pages, 8 figures, this paper is submitted to CVPR"},{"id":"http://arxiv.org/abs/2501.08659v2","updated":"2025-01-16T03:51:49Z","published":"2025-01-15T08:50:52Z","title":"BRIGHT-VO: Brightness-Guided Hybrid Transformer for Visual Odometry with\n Multi-modality Refinement Module","summary":" Visual odometry (VO) plays a crucial role in autonomous driving, robotic\nnavigation, and other related tasks by estimating the position and orientation\nof a camera based on visual input. Significant progress has been made in\ndata-driven VO methods, particularly those leveraging deep learning techniques\nto extract image features and estimate camera poses. However, these methods\noften struggle in low-light conditions because of the reduced visibility of\nfeatures and the increased difficulty of matching keypoints. 
To address this\nlimitation, we introduce BrightVO, a novel VO model based on Transformer\narchitecture, which not only performs front-end visual feature extraction, but\nalso incorporates a multi-modality refinement module in the back-end that\nintegrates Inertial Measurement Unit (IMU) data. Using pose graph optimization,\nthis module iteratively refines pose estimates to reduce errors and improve\nboth accuracy and robustness. Furthermore, we create a synthetic low-light\ndataset, KiC4R, which includes a variety of lighting conditions to facilitate\nthe training and evaluation of VO frameworks in challenging environments.\nExperimental results demonstrate that BrightVO achieves state-of-the-art\nperformance on both the KiC4R dataset and the KITTI benchmarks. Specifically,\nit provides an average improvement of 20% in pose estimation accuracy in normal\noutdoor environments and 259% in low-light conditions, outperforming existing\nmethods. For widespread use and further development, the research work is fully\nopen-source at https://github.com/Anastasiawd/BrightVO.\n","authors":["Dongzhihan Wang","Yang Yang","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2501.08659v2.pdf","comment":"We have identified significant issues in the methodology and data\n analysis that impact the validity of our conclusions"},{"id":"http://arxiv.org/abs/2501.09277v1","updated":"2025-01-16T03:47:25Z","published":"2025-01-16T03:47:25Z","title":"Bias for Action: Video Implicit Neural Representations with Bias\n Modulation","summary":" We propose a new continuous video modeling framework based on implicit neural\nrepresentations (INRs) called ActINR. At the core of our approach is the\nobservation that INRs can be considered as a learnable dictionary, with the\nshapes of the basis functions governed by the weights of the INR, and their\nlocations governed by the biases. Given compact non-linear activation\nfunctions, we hypothesize that an INR's biases are suitable to capture motion\nacross images, and facilitate compact representations for video sequences.\nUsing these observations, we design ActINR to share INR weights across frames\nof a video sequence, while using unique biases for each frame. We further model\nthe biases as the output of a separate INR conditioned on time index to promote\nsmoothness. By training the video INR and this bias INR together, we\ndemonstrate unique capabilities, including $10\\times$ video slow motion,\n$4\\times$ spatial super resolution along with $2\\times$ slow motion, denoising,\nand video inpainting. ActINR performs remarkably well across numerous video\nprocessing tasks (often achieving more than 6dB improvement), setting a new\nstandard for continuous modeling of videos.\n","authors":["Alper Kayabasi","Anil Kumar Vadathya","Guha Balakrishnan","Vishwanath Saragadam"],"pdf_url":"https://arxiv.org/pdf/2501.09277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09268v1","updated":"2025-01-16T03:35:23Z","published":"2025-01-16T03:35:23Z","title":"Knowledge Distillation for Image Restoration : Simultaneous Learning\n from Degraded and Clean Images","summary":" Model compression through knowledge distillation has seen extensive\napplication in classification and segmentation tasks. However, its potential in\nimage-to-image translation, particularly in image restoration, remains\nunderexplored. To address this gap, we propose a Simultaneous Learning\nKnowledge Distillation (SLKD) framework tailored for model compression in image\nrestoration tasks. 
SLKD employs a dual-teacher, single-student architecture\nwith two distinct learning strategies: Degradation Removal Learning (DRL) and\nImage Reconstruction Learning (IRL), simultaneously. In DRL, the student\nencoder learns from Teacher A to focus on removing degradation factors, guided\nby a novel BRISQUE extractor. In IRL, the student decoder learns from Teacher B\nto reconstruct clean images, with the assistance of a proposed PIQE extractor.\nThese strategies enable the student to learn from degraded and clean images\nsimultaneously, ensuring high-quality compression of image restoration models.\nExperimental results across five datasets and three tasks demonstrate that SLKD\nachieves substantial reductions in FLOPs and parameters, exceeding 80\\%, while\nmaintaining strong image restoration performance.\n","authors":["Yongheng Zhang","Danfeng Yan"],"pdf_url":"https://arxiv.org/pdf/2501.09268v1.pdf","comment":"Accepted by ICASSP2025"},{"id":"http://arxiv.org/abs/2501.09267v1","updated":"2025-01-16T03:34:36Z","published":"2025-01-16T03:34:36Z","title":"Are Open-Vocabulary Models Ready for Detection of MEP Elements on\n Construction Sites","summary":" The construction industry has long explored robotics and computer vision, yet\ntheir deployment on construction sites remains very limited. These technologies\nhave the potential to revolutionize traditional workflows by enhancing\naccuracy, efficiency, and safety in construction management. Ground robots\nequipped with advanced vision systems could automate tasks such as monitoring\nmechanical, electrical, and plumbing (MEP) systems. The present research\nevaluates the applicability of open-vocabulary vision-language models compared\nto fine-tuned, lightweight, closed-set object detectors for detecting MEP\ncomponents using a mobile ground robotic platform. A dataset collected with\ncameras mounted on a ground robot was manually annotated and analyzed to\ncompare model performance. The results demonstrate that, despite the\nversatility of vision-language models, fine-tuned lightweight models still\nlargely outperform them in specialized environments and for domain-specific\ntasks.\n","authors":["Abdalwhab Abdalwhab","Ali Imran","Sina Heydarian","Ivanka Iordanova","David St-Onge"],"pdf_url":"https://arxiv.org/pdf/2501.09267v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2501.06848v3","updated":"2025-01-16T03:18:14Z","published":"2025-01-12T15:34:24Z","title":"A General Framework for Inference-time Scaling and Steering of Diffusion\n Models","summary":" Diffusion models produce impressive results in modalities ranging from images\nand video to protein design and text. However, generating samples with\nuser-specified properties remains a challenge. Recent research proposes\nfine-tuning models to maximize rewards that capture desired properties, but\nthese methods require expensive training and are prone to mode collapse. In\nthis work, we propose Feynman Kac (FK) steering, an inference-time framework\nfor steering diffusion models with reward functions. FK steering works by\nsampling a system of multiple interacting diffusion processes, called\nparticles, and resampling particles at intermediate steps based on scores\ncomputed using functions called potentials. Potentials are defined using\nrewards for intermediate states and are selected such that a high value\nindicates that the particle will yield a high-reward sample. We explore various\nchoices of potentials, intermediate rewards, and samplers. 
We evaluate FK\nsteering on text-to-image and text diffusion models. For steering text-to-image\nmodels with a human preference reward, we find that FK steering a 0.8B\nparameter model outperforms a 2.6B parameter fine-tuned model on prompt\nfidelity, with faster sampling and no training. For steering text diffusion\nmodels with rewards for text quality and specific text attributes, we find that\nFK steering generates lower perplexity, more linguistically acceptable outputs\nand enables gradient-free control of attributes like toxicity. Our results\ndemonstrate that inference-time scaling and steering of diffusion models, even\nwith off-the-shelf rewards, can provide significant sample quality gains and\ncontrollability benefits. Code is available at\nhttps://github.com/zacharyhorvitz/Fk-Diffusion-Steering .\n","authors":["Raghav Singhal","Zachary Horvitz","Ryan Teehan","Mengye Ren","Zhou Yu","Kathleen McKeown","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2501.06848v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09259v1","updated":"2025-01-16T03:02:08Z","published":"2025-01-16T03:02:08Z","title":"OpticFusion: Multi-Modal Neural Implicit 3D Reconstruction of\n Microstructures by Fusing White Light Interferometry and Optical Microscopy","summary":" White Light Interferometry (WLI) is a precise optical tool for measuring the\n3D topography of microstructures. However, conventional WLI cannot capture the\nnatural color of a sample's surface, which is essential for many microscale\nresearch applications that require both 3D geometry and color information.\nPrevious methods have attempted to overcome this limitation by modifying WLI\nhardware and analysis software, but these solutions are often costly. In this\nwork, we address this challenge from a computer vision multi-modal\nreconstruction perspective for the first time. We introduce OpticFusion, a\nnovel approach that uses an additional digital optical microscope (OM) to\nachieve 3D reconstruction with natural color textures using multi-view WLI and\nOM images. Our method employs a two-step data association process to obtain the\nposes of WLI and OM data. By leveraging the neural implicit representation, we\nfuse multi-modal data and apply color decomposition technology to extract the\nsample's natural color. Tested on our multi-modal dataset of various microscale\nsamples, OpticFusion achieves detailed 3D reconstructions with color textures.\nOur method provides an effective tool for practical applications across\nnumerous microscale research fields. The source code and our real-world dataset\nare available at https://github.com/zju3dv/OpticFusion.\n","authors":["Shuo Chen","Yijin Li","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.09259v1.pdf","comment":"3DV 2025"},{"id":"http://arxiv.org/abs/2303.13397v6","updated":"2025-01-16T02:48:38Z","published":"2023-03-23T16:15:18Z","title":"DiffMesh: A Motion-aware Diffusion Framework for Human Mesh Recovery\n from Videos","summary":" Human mesh recovery (HMR) provides rich human body information for various\nreal-world applications. While image-based HMR methods have achieved impressive\nresults, they often struggle to recover humans in dynamic scenarios, leading to\ntemporal inconsistencies and non-smooth 3D motion predictions due to the\nabsence of human motion. In contrast, video-based approaches leverage temporal\ninformation to mitigate this issue. In this paper, we present DiffMesh, an\ninnovative motion-aware Diffusion-like framework for video-based HMR. 
DiffMesh\nestablishes a bridge between diffusion models and human motion, efficiently\ngenerating accurate and smooth output mesh sequences by incorporating human\nmotion within the forward process and reverse process in the diffusion model.\nExtensive experiments are conducted on the widely used datasets (Human3.6M\n\\cite{h36m_pami} and 3DPW \\cite{pw3d2018}), which demonstrate the effectiveness\nand efficiency of our DiffMesh. Visual comparisons in real-world scenarios\nfurther highlight DiffMesh's suitability for practical applications.\n","authors":["Ce Zheng","Xianpeng Liu","Qucheng Peng","Tianfu Wu","Pu Wang","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2303.13397v6.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2501.05264v3","updated":"2025-01-16T02:39:20Z","published":"2025-01-09T14:19:33Z","title":"Towards Balanced Continual Multi-Modal Learning in Human Pose Estimation","summary":" 3D human pose estimation (3D HPE) has emerged as a prominent research topic,\nparticularly in the realm of RGB-based methods. However, RGB images are\nsusceptible to limitations such as sensitivity to lighting conditions and\npotential user discomfort. Consequently, multi-modal sensing, which leverages\nnon-intrusive sensors, is gaining increasing attention. Nevertheless,\nmulti-modal 3D HPE still faces challenges, including modality imbalance and the\nimperative for continual learning. In this work, we introduce a novel balanced\ncontinual multi-modal learning method for 3D HPE, which harnesses the power of\nRGB, LiDAR, mmWave, and WiFi. Specifically, we propose a Shapley value-based\ncontribution algorithm to quantify the contribution of each modality and\nidentify modality imbalance. To address this imbalance, we employ a re-learning\nstrategy. Furthermore, recognizing that raw data is prone to noise\ncontamination, we develop a novel denoising continual learning approach. This\napproach incorporates a noise identification and separation module to mitigate\nthe adverse effects of noise and collaborates with the balanced learning\nstrategy to enhance optimization. Additionally, an adaptive EWC mechanism is\nemployed to alleviate catastrophic forgetting. We conduct extensive experiments\non the widely-adopted multi-modal dataset, MM-Fi, which demonstrate the\nsuperiority of our approach in boosting 3D pose estimation and mitigating\ncatastrophic forgetting in complex scenarios. We will release our codes.\n","authors":["Jiaxuan Peng","Mengshi Qi","Dong Zhao","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2501.05264v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00689v4","updated":"2025-01-16T02:12:45Z","published":"2023-11-01T17:45:22Z","title":"Collaboration in Immersive Environments: Challenges and Solutions","summary":" Virtual Reality (VR) and Augmented Reality (AR) tools have been applied in\nall engineering fields in order to avoid the use of physical prototypes, to\ntrain in high-risk situations, and to interpret real or simulated results. In\norder to complete a shared task or assign tasks to the agents in such immersive\nenvironments, collaboration or Shared Cooperative Activities are a necessity.\nCollaboration in immersive environments is an emerging field of research that\naims to study and enhance the ways in which people interact and work together\nin Virtual and Augmented Reality settings. Collaboration in immersive\nenvironments is a complex process that involves different factors such as\ncommunication, coordination, and social presence. 
This paper provides an\noverview of the current state of research on collaboration in immersive\nenvironments. It discusses the different types of immersive environments,\nincluding VR and AR, and the different forms of collaboration that can occur in\nthese environments. The paper also highlights the challenges and limitations of\ncollaboration in immersive environments, such as the lack of physical cues,\ncost and usability and the need for further research in this area. Overall,\ncollaboration in immersive environments is a promising field with a wide range\nof potential applications, from education to industry, and it can benefit both\nindividuals and groups by enhancing their ability to work together effectively.\n","authors":["Shahin Doroudian"],"pdf_url":"https://arxiv.org/pdf/2311.00689v4.pdf","comment":"Added new references in Networking section"},{"id":"http://arxiv.org/abs/2408.01167v3","updated":"2025-01-16T02:09:15Z","published":"2024-08-02T10:34:23Z","title":"Rethinking Pre-Trained Feature Extractor Selection in Multiple Instance\n Learning for Whole Slide Image Classification","summary":" Multiple instance learning (MIL) has become a preferred method for gigapixel\nwhole slide image (WSI) classification without requiring patch-level\nannotations. Current MIL research primarily relies on embedding-based\napproaches, which extract patch features using a pre-trained feature extractor\nand aggregate them for slide-level prediction. Despite the critical role of\nfeature extraction, there is limited guidance on selecting optimal feature\nextractors to maximize WSI performance. This study addresses this gap by\nsystematically evaluating MIL feature extractors across three dimensions:\npre-training dataset, backbone model, and pre-training method. Extensive\nexperiments were conducted on two public WSI datasets (TCGA-NSCLC and\nCamelyon16) using four state-of-the-art (SOTA) MIL models. Our findings reveal\nthat: 1) selecting a robust self-supervised learning (SSL) method has a greater\nimpact on performance than relying solely on an in-domain pre-training dataset;\n2) prioritizing Transformer-based backbones with deeper architectures over\nCNN-based models; and 3) using larger, more diverse pre-training datasets\nsignificantly enhances classification outcomes. We hope that these insights can\nprovide practical guidance for optimizing WSI classification and explain the\nreasons behind the performance advantages of the current SOTA pathology\nfoundation models. Furthermore, this work may inform the development of more\neffective pathology foundation models. Our code is publicly available at\nhttps://github.com/bryanwong17/MIL-Feature-Extractor-Selection\n","authors":["Bryan Wong","Mun Yong Yi"],"pdf_url":"https://arxiv.org/pdf/2408.01167v3.pdf","comment":"Accepted to IEEE International Symposium on Biomedical Imaging (ISBI)\n 2025"},{"id":"http://arxiv.org/abs/2408.01077v3","updated":"2025-01-16T02:08:47Z","published":"2024-08-02T07:52:28Z","title":"PhysMamba: State Space Duality Model for Remote Physiological\n Measurement","summary":" Remote Photoplethysmography (rPPG) enables non-contact physiological signal\nextraction from facial videos, offering applications in psychological state\nanalysis, medical assistance, and anti-face spoofing. However, challenges such\nas motion artifacts, lighting variations, and noise limit its real-world\napplicability. 
To address these issues, we propose PhysMamba, a novel\ndual-pathway time-frequency interaction model based on Synergistic State Space\nDuality (SSSD), which for the first time integrates state space models with\nattention mechanisms in a dual-branch framework. Combined with a Multi-Scale\nQuery (MQ) mechanism, PhysMamba achieves efficient information exchange and\nenhanced feature representation, ensuring robustness under noisy and dynamic\nconditions. Experiments on PURE, UBFC-rPPG, and MMPD datasets demonstrate that\nPhysMamba outperforms state-of-the-art methods, offering superior accuracy and\ngeneralization. This work lays a strong foundation for practical applications\nin non-contact health monitoring, including real-time remote patient care.\n","authors":["Zhixin Yan","Yan Zhong","Hongbin Xu","Wenjun Zhang","Shangru Yi","Lin Shu","Wenxiong Kang"],"pdf_url":"https://arxiv.org/pdf/2408.01077v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10377v4","updated":"2025-01-16T01:30:35Z","published":"2024-07-15T01:11:30Z","title":"Enhanced Masked Image Modeling to Avoid Model Collapse on Multi-modal\n MRI Datasets","summary":" Multi-modal magnetic resonance imaging (MRI) provides information of lesions\nfor computer-aided diagnosis from different views. Deep learning algorithms are\nsuitable for identifying specific anatomical structures, segmenting lesions,\nand classifying diseases. Manual labels are limited due to the high expense,\nwhich hinders further improvement of accuracy. Self-supervised learning,\nparticularly masked image modeling (MIM), has shown promise in utilizing\nunlabeled data. However, we spot model collapse when applying MIM to\nmulti-modal MRI datasets. The performance of downstream tasks does not see any\nimprovement following the collapsed model. To solve model collapse, we analyze\nand address it in two types: complete collapse and dimensional collapse. We\nfind complete collapse occurs because the collapsed loss value in multi-modal\nMRI datasets falls below the normally converged loss value. Based on this, the\nhybrid mask pattern (HMP) masking strategy is introduced to elevate the\ncollapsed loss above the normally converged loss value and avoid complete\ncollapse. Additionally, we reveal that dimensional collapse stems from\ninsufficient feature uniformity in MIM. We mitigate dimensional collapse by\nintroducing the pyramid barlow twins (PBT) module as an explicit regularization\nmethod. Overall, we construct the enhanced MIM (E-MIM) with HMP and PBT module\nto avoid model collapse multi-modal MRI. Experiments are conducted on three\nmulti-modal MRI datasets to validate the effectiveness of our approach in\npreventing both types of model collapse. By preventing model collapse, the\ntraining of the model becomes more stable, resulting in a decent improvement in\nperformance for segmentation and classification tasks. 
The code is available at\nhttps://github.com/LinxuanHan/E-MIM.\n","authors":["Linxuan Han","Sa Xiao","Zimeng Li","Haidong Li","Xiuchao Zhao","Yeqing Han","Fumin Guo","Xin Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.10377v4.pdf","comment":"This work has been submitted to the lEEE for possible publication.\n copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2408.05526v2","updated":"2025-01-16T00:54:04Z","published":"2024-08-10T11:48:14Z","title":"CryoBench: Diverse and challenging datasets for the heterogeneity\n problem in cryo-EM","summary":" Cryo-electron microscopy (cryo-EM) is a powerful technique for determining\nhigh-resolution 3D biomolecular structures from imaging data. Its unique\nability to capture structural variability has spurred the development of\nheterogeneous reconstruction algorithms that can infer distributions of 3D\nstructures from noisy, unlabeled imaging data. Despite the growing number of\nadvanced methods, progress in the field is hindered by the lack of standardized\nbenchmarks with ground truth information and reliable validation metrics. Here,\nwe introduce CryoBench, a suite of datasets, metrics, and benchmarks for\nheterogeneous reconstruction in cryo-EM. CryoBench includes five datasets\nrepresenting different sources of heterogeneity and degrees of difficulty.\nThese include conformational heterogeneity generated from designed motions of\nantibody complexes or sampled from a molecular dynamics simulation, as well as\ncompositional heterogeneity from mixtures of ribosome assembly states or 100\ncommon complexes present in cells. We then analyze state-of-the-art\nheterogeneous reconstruction tools, including neural and non-neural methods,\nassess their sensitivity to noise, and propose new metrics for quantitative\nevaluation. We hope that CryoBench will be a foundational resource for\naccelerating algorithmic development and evaluation in the cryo-EM and machine\nlearning communities. Project page: https://cryobench.cs.princeton.edu.\n","authors":["Minkyu Jeon","Rishwanth Raghu","Miro Astore","Geoffrey Woollard","Ryan Feathers","Alkin Kaz","Sonya M. Hanson","Pilar Cossio","Ellen D. Zhong"],"pdf_url":"https://arxiv.org/pdf/2408.05526v2.pdf","comment":"Accepted by NeurIPS 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2501.09221v1","updated":"2025-01-16T00:45:05Z","published":"2025-01-16T00:45:05Z","title":"Leveraging Scale-aware Representations for improved\n Concept-Representation Alignment in ViTs","summary":" Vision Transformers (ViTs) are increasingly being adopted in various\nsensitive vision applications - like medical diagnosis, facial recognition,\netc. To improve the interpretability of such models, many approaches attempt to\nforward-align them with carefully annotated abstract, human-understandable\nsemantic entities - concepts. Concepts provide global rationales to the model\npredictions and can be quickly understood/intervened on by domain experts. Most\ncurrent research focuses on designing model-agnostic, plug-and-play generic\nconcept-based explainability modules that do not incorporate the inner workings\nof foundation models (e.g., inductive biases, scale invariance, etc.) during\ntraining. To alleviate this issue for ViTs, in this paper, we propose a novel\nConcept Representation Alignment Module (CRAM) which learns both scale and\nposition-aware representations from multi-scale feature pyramids and patch\nrepresentations respectively. 
CRAM further aligns these representations with\nconcept annotations through an attention matrix. The proposed CRAM module\nimproves the predictive performance of ViT architectures and also provides\naccurate and robust concept explanations as demonstrated on five datasets -\nincluding three widely used benchmarks (CUB, Pascal APY, Concept-MNIST) and 2\nreal-world datasets (AWA2, KITS).\n","authors":["Sanchit Sinha","Guangzhi Xiong","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2501.09221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09217v1","updated":"2025-01-16T00:33:01Z","published":"2025-01-16T00:33:01Z","title":"Adaptive Law-Based Transformation (ALT): A Lightweight Feature\n Representation for Time Series Classification","summary":" Time series classification (TSC) is fundamental in numerous domains,\nincluding finance, healthcare, and environmental monitoring. However,\ntraditional TSC methods often struggle with the inherent complexity and\nvariability of time series data. Building on our previous work with the linear\nlaw-based transformation (LLT) - which improved classification accuracy by\ntransforming the feature space based on key data patterns - we introduce\nadaptive law-based transformation (ALT). ALT enhances LLT by incorporating\nvariable-length shifted time windows, enabling it to capture distinguishing\npatterns of various lengths and thereby handle complex time series more\neffectively. By mapping features into a linearly separable space, ALT provides\na fast, robust, and transparent solution that achieves state-of-the-art\nperformance with only a few hyperparameters.\n","authors":["Marcell T. Kurbucz","Balázs Hajós","Balázs P. Halmos","Vince Á. Molnár","Antal Jakovác"],"pdf_url":"https://arxiv.org/pdf/2501.09217v1.pdf","comment":"8 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2405.03762v3","updated":"2025-01-16T00:22:53Z","published":"2024-05-06T18:01:13Z","title":"Swin transformers are robust to distribution and concept drift in\n endoscopy-based longitudinal rectal cancer assessment","summary":" Endoscopic images are used at various stages of rectal cancer treatment\nstarting from cancer screening, diagnosis, during treatment to assess response\nand toxicity from treatments such as colitis, and at follow up to detect new\ntumor or local regrowth (LR). However, subjective assessment is highly variable\nand can underestimate the degree of response in some patients, subjecting them\nto unnecessary surgery, or overestimate response that places patients at risk\nof disease spread. Advances in deep learning has shown the ability to produce\nconsistent and objective response assessment for endoscopic images. However,\nmethods for detecting cancers, regrowth, and monitoring response during the\nentire course of patient treatment and follow-up are lacking. This is because,\nautomated diagnosis and rectal cancer response assessment requires methods that\nare robust to inherent imaging illumination variations and confounding\nconditions (blood, scope, blurring) present in endoscopy images as well as\nchanges to the normal lumen and tumor during treatment. Hence, a hierarchical\nshifted window (Swin) transformer was trained to distinguish rectal cancer from\nnormal lumen using endoscopy images. 
Swin as well as two convolutional\n(ResNet-50, WideResNet-50), and vision transformer (ViT) models were trained\nand evaluated on follow-up longitudinal images to detect LR on private dataset\nas well as on out-of-distribution (OOD) public colonoscopy datasets to detect\npre/non-cancerous polyps. Color shifts were applied using optimal transport to\nsimulate distribution shifts. Swin and ResNet models were similarly accurate in\nthe in-distribution dataset. Swin was more accurate than other methods\n(follow-up: 0.84, OOD: 0.83) even when subject to color shifts (follow-up:\n0.83, OOD: 0.87), indicating capability to provide robust performance for\nlongitudinal cancer assessment.\n","authors":["Jorge Tapias Gomez","Aneesh Rangnekar","Hannah Williams","Hannah Thompson","Julio Garcia-Aguilar","Joshua Jesse Smith","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2405.03762v3.pdf","comment":"The work has been accepted for publication in 2024 SPIE Medical\n Imaging conference proceedings"},{"id":"http://arxiv.org/abs/2501.09209v1","updated":"2025-01-16T00:03:04Z","published":"2025-01-16T00:03:04Z","title":"Surgical Visual Understanding (SurgVU) Dataset","summary":" Owing to recent advances in machine learning and the ability to harvest large\namounts of data during robotic-assisted surgeries, surgical data science is\nripe for foundational work. We present a large dataset of surgical videos and\ntheir accompanying labels for this purpose. We describe how the data was\ncollected and some of its unique attributes. Multiple example problems are\noutlined. Although the dataset was curated for a particular set of scientific\nchallenges (in an accompanying paper), it is general enough to be used for a\nbroad range machine learning questions. Our hope is that this dataset exposes\nthe larger machine learning community to the challenging problems within\nsurgical data science, and becomes a touchstone for future research. The videos\nare available at\nhttps://storage.googleapis.com/isi-surgvu/surgvu24_videos_only.zip, the labels\nat https://storage.googleapis.com/isi-surgvu/surgvu24_labels_updated_v2.zip,\nand a validation set for tool detection problem at\nhttps://storage.googleapis.com/isi-surgvu/cat1_test_set_public.zip.\n","authors":["Aneeq Zia","Max Berniker","Rogerio Nespolo","Conor Perreault","Ziheng Wang","Benjamin Mueller","Ryan Schmidt","Kiran Bhattacharyya","Xi Liu","Anthony Jarc"],"pdf_url":"https://arxiv.org/pdf/2501.09209v1.pdf","comment":null}]},"2025-01-17T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2501.10356v1","updated":"2025-01-17T18:57:39Z","published":"2025-01-17T18:57:39Z","title":"DexForce: Extracting Force-informed Actions from Kinesthetic\n Demonstrations for Dexterous Manipulation","summary":" Imitation learning requires high-quality demonstrations consisting of\nsequences of state-action pairs. For contact-rich dexterous manipulation tasks\nthat require fine-grained dexterity, the actions in these state-action pairs\nmust produce the right forces. Current widely-used methods for collecting\ndexterous manipulation demonstrations are difficult to use for demonstrating\ncontact-rich tasks due to unintuitive human-to-robot motion retargeting and the\nlack of direct haptic feedback. Motivated by this, we propose DexForce, a\nmethod for collecting demonstrations of contact-rich dexterous manipulation.\nDexForce leverages contact forces, measured during kinesthetic demonstrations,\nto compute force-informed actions for policy learning. 
We use DexForce to\ncollect demonstrations for six tasks and show that policies trained on our\nforce-informed actions achieve an average success rate of 76% across all tasks.\nIn contrast, policies trained directly on actions that do not account for\ncontact forces have near-zero success rates. We also conduct a study ablating\nthe inclusion of force data in policy observations. We find that while using\nforce data never hurts policy performance, it helps the most for tasks that\nrequire an advanced level of precision and coordination, like opening an\nAirPods case and unscrewing a nut.\n","authors":["Claire Chen","Zhongchun Yu","Hojung Choi","Mark Cutkosky","Jeannette Bohg"],"pdf_url":"https://arxiv.org/pdf/2501.10356v1.pdf","comment":"Videos can be found here:\n https://clairelc.github.io/dexforce.github.io/"},{"id":"http://arxiv.org/abs/2411.06627v2","updated":"2025-01-17T18:38:58Z","published":"2024-11-10T23:22:49Z","title":"Optimal Virtual Model Control for Robotics: Design and Tuning of\n Passivity-Based Controllers","summary":" Passivity-based control is a cornerstone of control theory and an established\ndesign approach in robotics. Its strength is based on the passivity theorem,\nwhich provides a powerful interconnection framework for robotics. However, the\ndesign of passivity-based controllers and their optimal tuning remain\nchallenging. We propose here an intuitive design approach for fully actuated\nrobots, where the control action is determined by a `virtual-mechanism' as in\nclassical virtual model control. The result is a robot whose controlled\nbehavior can be understood in terms of physics. We achieve optimal tuning by\napplying algorithmic differentiation to ODE simulations of the rigid body\ndynamics. Overall, this leads to a flexible design and optimization approach:\nstability is proven by passivity of the virtual mechanism, while performance is\nobtained by optimization using algorithmic differentiation.\n","authors":["Daniel Larby","Fulvio Forni"],"pdf_url":"https://arxiv.org/pdf/2411.06627v2.pdf","comment":"14 pages, 17 figures"},{"id":"http://arxiv.org/abs/2501.09600v2","updated":"2025-01-17T17:07:31Z","published":"2025-01-16T15:22:06Z","title":"Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid\n Prototyping in Virtual Reality Applications","summary":" SLAM is a foundational technique with broad applications in robotics and\nAR/VR. SLAM simulations evaluate new concepts, but testing on\nresource-constrained devices, such as VR HMDs, faces challenges: high\ncomputational cost and restricted sensor data access. This work proposes a\nsparse framework using mesh geometry projections as features, which improves\nefficiency and circumvents direct sensor data access, advancing SLAM research\nas we demonstrate in VR and through numerical evaluation.\n","authors":["Carlos Augusto Pinheiro de Sousa","Heiko Hamann","Oliver Deussen"],"pdf_url":"https://arxiv.org/pdf/2501.09600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.07600v4","updated":"2025-01-17T16:52:03Z","published":"2022-09-15T20:27:54Z","title":"STPOTR: Simultaneous Human Trajectory and Pose Prediction Using a\n Non-Autoregressive Transformer for Robot Following Ahead","summary":" In this paper, we develop a neural network model to predict future human\nmotion from an observed human motion history. We propose a non-autoregressive\ntransformer architecture to leverage its parallel nature for easier training\nand fast, accurate predictions at test time. 
The proposed architecture divides\nhuman motion prediction into two parts: 1) the human trajectory, which is the\nhip joint 3D position over time and 2) the human pose which is the all other\njoints 3D positions over time with respect to a fixed hip joint. We propose to\nmake the two predictions simultaneously, as the shared representation can\nimprove the model performance. Therefore, the model consists of two sets of\nencoders and decoders. First, a multi-head attention module applied to encoder\noutputs improves human trajectory. Second, another multi-head self-attention\nmodule applied to encoder outputs concatenated with decoder outputs facilitates\nlearning of temporal dependencies. Our model is well-suited for robotic\napplications in terms of test accuracy and speed, and compares favorably with\nrespect to state-of-the-art methods. We demonstrate the real-world\napplicability of our work via the Robot Follow-Ahead task, a challenging yet\npractical case study for our proposed model.\n","authors":["Mohammad Mahdavian","Payam Nikdel","Mahdi TaherAhmadi","Mo Chen"],"pdf_url":"https://arxiv.org/pdf/2209.07600v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09374v2","updated":"2025-01-17T15:52:06Z","published":"2024-10-12T05:35:27Z","title":"ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras","summary":" Event-based visual odometry is a specific branch of visual Simultaneous\nLocalization and Mapping (SLAM) techniques, which aims at solving tracking and\nmapping subproblems (typically in parallel), by exploiting the special working\nprinciples of neuromorphic (i.e., event-based) cameras. Due to the\nmotion-dependent nature of event data, explicit data association (i.e., feature\nmatching) under large-baseline view-point changes is difficult to establish,\nmaking direct methods a more rational choice. However, state-of-the-art direct\nmethods are limited by the high computational complexity of the mapping\nsub-problem and the degeneracy of camera pose tracking in certain degrees of\nfreedom (DoF) in rotation. In this paper, we tackle these issues by building an\nevent-based stereo visual-inertial odometry system on top of a direct pipeline.\nSpecifically, to speed up the mapping operation, we propose an efficient\nstrategy for sampling contour points according to the local dynamics of events.\nThe mapping performance is also improved in terms of structure completeness and\nlocal smoothness by merging the temporal stereo and static stereo results. To\ncircumvent the degeneracy of camera pose tracking in recovering the pitch and\nyaw components of general 6-DoF motion, we introduce IMU measurements as motion\npriors via pre-integration. To this end, a compact back-end is proposed for\ncontinuously updating the IMU bias and predicting the linear velocity, enabling\nan accurate motion prediction for camera pose tracking. The resulting system\nscales well with modern high-resolution event cameras and leads to better\nglobal positioning accuracy in large-scale outdoor environments. 
Extensive\nevaluations on five publicly available datasets featuring different resolutions\nand scenarios justify the superior performance of the proposed system against\nfive state-of-the-art methods.\n","authors":["Junkai Niu","Sheng Zhong","Xiuyuan Lu","Shaojie Shen","Guillermo Gallego","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.09374v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10262v1","updated":"2025-01-17T15:43:49Z","published":"2025-01-17T15:43:49Z","title":"Deployment of an Aerial Multi-agent System for Automated Task Execution\n in Large-scale Underground Mining Environments","summary":" In this article, we present a framework for deploying an aerial multi-agent\nsystem in large-scale subterranean environments with minimal infrastructure for\nsupporting multi-agent operations. The multi-agent objective is to optimally\nand reactively allocate and execute inspection tasks in a mine, which are\nentered by a mine operator on-the-fly. The assignment of currently available\ntasks to the team of agents is accomplished through an auction-based system,\nwhere the agents bid for available tasks, which are used by a central\nauctioneer to optimally assigns tasks to agents. A mobile Wi-Fi mesh supports\ninter-agent communication and bi-directional communication between the agents\nand the task allocator, while the task execution is performed completely\ninfrastructure-free. Given a task to be accomplished, a reliable and modular\nagent behavior is synthesized by generating behavior trees from a pool of agent\ncapabilities, using a back-chaining approach. The auction system in the\nproposed framework is reactive and supports addition of new operator-specified\ntasks on-the-go, at any point through a user-friendly operator interface. The\nframework has been validated in a real underground mining environment using\nthree aerial agents, with several inspection locations spread in an environment\nof almost 200 meters. The proposed framework can be utilized for missions\ninvolving rapid inspection, gas detection, distributed sensing and mapping etc.\nin a subterranean environment. The proposed framework and its field deployment\ncontributes towards furthering reliable automation in large-scale subterranean\nenvironments to offload both routine and dangerous tasks from human operators\nto autonomous aerial robots.\n","authors":["Niklas Dahlquist","Samuel Nordström","Nikolaos Stathoulopoulos","Björn Lindqvist","Akshit Saradagi","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2501.10262v1.pdf","comment":"Submitted to IEEE Transactions on Field Robotics"},{"id":"http://arxiv.org/abs/2405.04392v2","updated":"2025-01-17T15:21:52Z","published":"2024-05-07T15:14:49Z","title":"BILTS: A Bi-Invariant Similarity Measure for Robust Object Trajectory\n Recognition under Reference Frame Variations","summary":" When similar object motions are performed in diverse contexts but are meant\nto be recognized under a single classification, these contextual variations act\nas disturbances that negatively affect accurate motion recognition. In this\npaper, we focus on contextual variations caused by reference frame variations.\nTo robustly deal with these variations, similarity measures have been\nintroduced that compare object motion trajectories in a context-invariant\nmanner. However, most are highly sensitive to noise near singularities, where\nthe measure is not uniquely defined, and lack bi-invariance (invariance to both\nworld and body frame variations). 
To address these issues, we propose the novel\n\\textit{Bi-Invariant Local Trajectory-Shape Similarity} (BILTS) measure.\nCompared to other measures, the BILTS measure uniquely offers bi-invariance,\nboundedness, and third-order shape identity. Aimed at practical\nimplementations, we devised a discretized and regularized version of the BILTS\nmeasure which shows exceptional robustness to singularities. This is\ndemonstrated through rigorous recognition experiments using multiple datasets.\nOn average, BILTS attained the highest recognition ratio and least sensitivity\nto contextual variations compared to other invariant object motion similarity\nmeasures. We believe that the BILTS measure is a valuable tool for recognizing\nmotions performed in diverse contexts and has potential in other applications,\nincluding the recognition, segmentation, and adaptation of both motion and\nforce trajectories.\n","authors":["Arno Verduyn","Erwin Aertbeliën","Glenn Maes","Joris De Schutter","Maxim Vochten"],"pdf_url":"https://arxiv.org/pdf/2405.04392v2.pdf","comment":"This work has been submitted as a regular research paper for\n consideration in the Journal of Intelligent & Robotic Systems. The content in\n this preprint is identical to the version submitted for peer review, except\n for formatting differences required by the journal"},{"id":"http://arxiv.org/abs/2501.10156v1","updated":"2025-01-17T12:41:51Z","published":"2025-01-17T12:41:51Z","title":"Tethered Variable Inertial Attitude Control Mechanisms through a Modular\n Jumping Limbed Robot","summary":" This paper presents the concept of a tethered variable inertial attitude\ncontrol mechanism for a modular jumping-limbed robot designed for planetary\nexploration in low-gravity environments. The system, named SPLITTER, comprises\ntwo sub-10 kg quadrupedal robots connected by a tether, capable of executing\nsuccessive jumping gaits and stabilizing in-flight using inertial morphing\ntechnology. Through model predictive control (MPC), attitude control was\ndemonstrated by adjusting the limbs and tether length to modulate the system's\nprincipal moments of inertia. Our results indicate that this control strategy\nallows the robot to stabilize during flight phases without needing traditional\nflywheel-based systems or relying on aerodynamics, making the approach\nmass-efficient and ideal for small-scale planetary robots' successive jumps.\nThe paper outlines the dynamics, MPC formulation for inertial morphing,\nactuator requirements, and simulation results, illustrating the potential of\nagile exploration for small-scale rovers in low-gravity environments like the\nMoon or asteroids.\n","authors":["Yusuke Tanaka","Alvin Zhu","Dennis Hong"],"pdf_url":"https://arxiv.org/pdf/2501.10156v1.pdf","comment":"Proceeding to IEEE Aerospace Conference 2025"},{"id":"http://arxiv.org/abs/2412.19567v2","updated":"2025-01-17T12:10:42Z","published":"2024-12-27T10:10:52Z","title":"Safe Interval Randomized Path Planning For Manipulators","summary":" Planning safe paths in 3D workspace for high DoF robotic systems, such as\nmanipulators, is a challenging problem, especially when the environment is\npopulated with the dynamic obstacles that need to be avoided. In this case the\ntime dimension should be taken into account that further increases the\ncomplexity of planning. 
To mitigate this issue we suggest to combine\nsafe-interval path planning (a prominent technique in heuristic search) with\nthe randomized planning, specifically, with the bidirectional rapidly-exploring\nrandom trees (RRT-Connect) - a fast and efficient algorithm for\nhigh-dimensional planning. Leveraging a dedicated technique of fast computation\nof the safe intervals we end up with an efficient planner dubbed SI-RRT. We\ncompare it with the state of the art and show that SI-RRT consistently\noutperforms the competitors both in runtime and solution cost.\n Our implementation of SI-RRT is publicly available at\nhttps://github.com/PathPlanning/ManipulationPlanning-SI-RRT\n","authors":["Nuraddin Kerimov","Aleksandr Onegin","Konstantin Yakovlev"],"pdf_url":"https://arxiv.org/pdf/2412.19567v2.pdf","comment":"Submitted to The 35th International Conference on Automated Planning\n and Scheduling (ICAPS 2025)"},{"id":"http://arxiv.org/abs/2501.10105v1","updated":"2025-01-17T10:45:22Z","published":"2025-01-17T10:45:22Z","title":"Universal Actions for Enhanced Embodied Foundation Models","summary":" Training on diverse, internet-scale data is a key factor in the success of\nrecent large foundation models. Yet, using the same recipe for building\nembodied agents has faced noticeable difficulties. Despite the availability of\nmany crowd-sourced embodied datasets, their action spaces often exhibit\nsignificant heterogeneity due to distinct physical embodiment and control\ninterfaces for different robots, causing substantial challenges in developing\nembodied foundation models using cross-domain data. In this paper, we introduce\nUniAct, a new embodied foundation modeling framework operating in a tokenized\nUniversal Action Space. Our learned universal actions capture the generic\natomic behaviors across diverse robots by exploiting their shared structural\nfeatures, and enable enhanced cross-domain data utilization and\ncross-embodiment generalizations by eliminating the notorious heterogeneity.\nThe universal actions can be efficiently translated back to heterogeneous\nactionable commands by simply adding embodiment-specific details, from which\nfast adaptation to new robots becomes simple and straightforward. Our 0.5B\ninstantiation of UniAct outperforms 14X larger SOTA embodied foundation models\nin extensive evaluations on various real-world and simulation robots,\nshowcasing exceptional cross-embodiment control and adaptation capability,\nhighlighting the crucial benefit of adopting universal actions. Project page:\nhttps://github.com/2toinf/UniAct\n","authors":["Jinliang Zheng","Jianxiong Li","Dongxiu Liu","Yinan Zheng","Zhihao Wang","Zhonghong Ou","Yu Liu","Jingjing Liu","Ya-Qin Zhang","Xianyuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2501.10105v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2501.10100v1","updated":"2025-01-17T10:39:09Z","published":"2025-01-17T10:39:09Z","title":"Robotic World Model: A Neural Network Simulator for Robust Policy\n Optimization in Robotics","summary":" Learning robust and generalizable world models is crucial for enabling\nefficient and scalable robotic control in real-world environments. In this\nwork, we introduce a novel framework for learning world models that accurately\ncapture complex, partially observable, and stochastic dynamics. 
The proposed\nmethod employs a dual-autoregressive mechanism and self-supervised training to\nachieve reliable long-horizon predictions without relying on domain-specific\ninductive biases, ensuring adaptability across diverse robotic tasks. We\nfurther propose a policy optimization framework that leverages world models for\nefficient training in imagined environments and seamless deployment in\nreal-world systems. Through extensive experiments, our approach consistently\noutperforms state-of-the-art methods, demonstrating superior autoregressive\nprediction accuracy, robustness to noise, and generalization across\nmanipulation and locomotion tasks. Notably, policies trained with our method\nare successfully deployed on ANYmal D hardware in a zero-shot transfer,\nachieving robust performance with minimal sim-to-real performance loss. This\nwork advances model-based reinforcement learning by addressing the challenges\nof long-horizon prediction, error accumulation, and sim-to-real transfer. By\nproviding a scalable and robust framework, the introduced methods pave the way\nfor adaptive and efficient robotic systems in real-world applications.\n","authors":["Chenhao Li","Andreas Krause","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2501.10100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10074v1","updated":"2025-01-17T09:46:27Z","published":"2025-01-17T09:46:27Z","title":"SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and\n Chain-of-Thought for Embodied Task Planning","summary":" Spatial reasoning is an essential problem in embodied AI research. Efforts to\nenhance spatial reasoning abilities through supplementary spatial data and\nfine-tuning have proven limited and ineffective when addressing complex\nembodied tasks, largely due to their dependence on language-based outputs.\nWhile some approaches have introduced a point-based action space to mitigate\nthis issue, they fall short in managing more intricate tasks within complex\nenvironments. This deficiency arises from their failure to fully exploit the\ninherent thinking and reasoning capabilities that are fundamental strengths of\nVision-Language Models (VLMs). To address these limitations, we propose a novel\napproach named SpatialCoT, specifically designed to bolster the spatial\nreasoning capabilities of VLMs. Our approach comprises two stages: spatial\ncoordinate bi-directional alignment, which aligns vision-language inputs with\nspatial coordinates, and chain-of-thought spatial grounding, which harnesses\nthe reasoning capabilities of language models for advanced spatial reasoning.\nWe evaluate SpatialCoT on challenging navigation and manipulation tasks, both\nin simulation and real-world settings. 
Experimental results demonstrate that\nour method significantly outperforms previous state-of-the-art approaches in\nboth tasks.\n","authors":["Yuecheng Liu","Dafeng Chi","Shiguang Wu","Zhanguang Zhang","Yaochen Hu","Lingfeng Zhang","Yingxue Zhang","Shuang Wu","Tongtong Cao","Guowei Huang","Guangjian Tian","Xingyue Quan","Jianye Hao","Yuzheng Zhuang"],"pdf_url":"https://arxiv.org/pdf/2501.10074v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.10066v1","updated":"2025-01-17T09:37:51Z","published":"2025-01-17T09:37:51Z","title":"A Comprehensive Insights into Drones: History, Classification,\n Architecture, Navigation, Applications, Challenges, and Future Trends","summary":" Unmanned Aerial Vehicles (UAVs), commonly known as Drones, are one of 21st\ncentury most transformative technologies. Emerging first for military use,\nadvancements in materials, electronics, and software have catapulted drones\ninto multipurpose tools for a wide range of industries. In this paper, we have\ncovered the history, taxonomy, architecture, navigation systems and branched\nactivities for the same. It explores important future trends like autonomous\nnavigation, AI integration, and obstacle avoidance systems, emphasizing how\nthey contribute to improving the efficiency and versatility of drones. It also\nlooks at the major challenges like technical, environmental, economic,\nregulatory and ethical, that limit the actual take-up of drones, as well as\ntrends that are likely to mitigate these obstacles in the future. This work\noffers a structured synthesis of existing studies and perspectives that enable\ninsights about how drones will transform agriculture, logistics, healthcare,\ndisaster management, and other areas, while also identifying new opportunities\nfor innovation and development.\n","authors":["Ruchita Singh","Sandeep Kumar"],"pdf_url":"https://arxiv.org/pdf/2501.10066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18251v2","updated":"2025-01-17T05:49:00Z","published":"2024-05-28T15:02:09Z","title":"Sensor-Based Distributionally Robust Control for Safe Robot Navigation\n in Dynamic Environments","summary":" We introduce a novel method for mobile robot navigation in dynamic, unknown\nenvironments, leveraging onboard sensing and distributionally robust\noptimization to impose probabilistic safety constraints. Our method introduces\na distributionally robust control barrier function (DR-CBF) that directly\nintegrates noisy sensor measurements and state estimates to define safety\nconstraints. This approach is applicable to a wide range of control-affine\ndynamics, generalizable to robots with complex geometries, and capable of\noperating at real-time control frequencies. Coupled with a control Lyapunov\nfunction (CLF) for path following, the proposed CLF-DR-CBF control synthesis\nmethod achieves safe, robust, and efficient navigation in challenging\nenvironments. 
We demonstrate the effectiveness and robustness of our approach\nfor safe autonomous navigation under uncertainty in simulations and real-world\nexperiments with differential-drive robots.\n","authors":["Kehan Long","Yinzhuang Yi","Zhirui Dai","Sylvia Herbert","Jorge Cortés","Nikolay Atanasov"],"pdf_url":"https://arxiv.org/pdf/2405.18251v2.pdf","comment":"Project page: https://existentialrobotics.org/DRO_Safe_Navigation"},{"id":"http://arxiv.org/abs/2501.09937v1","updated":"2025-01-17T03:20:39Z","published":"2025-01-17T03:20:39Z","title":"Adaptive Twisting Sliding Control for Integrated Attack UAV's Autopilot\n and Guidance","summary":" This paper investigates an adaptive sliding-mode control for an integrated\nUAV autopilot and guidance system. First, a two-dimensional mathematical model\nof the system is derived by considering the incorporated lateral dynamics and\nrelative kinematics of the UAV and its potential target of attack. Then, a\nsliding surface is derived utilizing the zero-effort miss distance. An adaptive\ntwisting sliding mode (ATSMC) algorithm is applied to the integrated system.\nSimulation and comparisons have been accomplished. The results show our\nproposed design performs well in interception precision, even with high\nnonlinearity, uncertainties, disturbances, and abrupt changes in the target's\nmovement, thanks to the adaptation strategy.\n","authors":["Minh Tu Nguyen","Van Truong Hoang","Manh Duong Phung","Van Hoa Doan"],"pdf_url":"https://arxiv.org/pdf/2501.09937v1.pdf","comment":"in Proceedings of the 2025 International Conference on Energy,\n Infrastructure and Environmental Research (EIER2025)"},{"id":"http://arxiv.org/abs/2501.09905v1","updated":"2025-01-17T01:32:18Z","published":"2025-01-17T01:32:18Z","title":"SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon\n Visuomotor Learning","summary":" We present a low-cost quadruped manipulation system that solves long-horizon\nreal-world tasks, trained by reinforcement learning purely in simulation. The\nsystem comprises 1) a hierarchical design of a high-level policy for\nvisual-mobile manipulation following instructions, and a low-level policy for\nquadruped movement and limb-control, 2) a progressive policy expansion approach\nfor solving the long-horizon task together with a teacher-student framework for\nefficient high-level training of the high-level visuomotor policy, and 3) a\nsuite of techniques for minimizing sim-to-real gaps.\n With budget-friendly but limited reliability and performance hardware, and\njust one wrist-mounted RGB camera, the entire system fully trained in\nsimulation achieves high success rates for long horizon tasks involving search,\nmove, grasp, and drop-into, with fluid sim-to-real transfer in a wide variety\nof indoor and outdoor scenes and lighting conditions.Extensive real-world\nevaluations show that on the long horizon mobile manipulation tasks, our system\nachieves good performance when transferred to real both in terms of task\nsuccess rate and execution efficiency. 
Finally, we discuss the necessity of our\nsim-to-real techniques for legged mobile manipulation, and show their ablation\nperformance.\n","authors":["Haichao Zhang","Haonan Yu","Le Zhao","Andrew Choi","Qinxun Bai","Yiqing Yang","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2501.09905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08396v4","updated":"2025-01-17T01:07:17Z","published":"2023-10-12T15:08:36Z","title":"Uncertainty-Aware Planning for Heterogeneous Robot Teams using Dynamic\n Topological Graphs and Mixed-Integer Programming","summary":" Multi-robot planning and coordination in uncertain environments is a\nfundamental computational challenge, since the belief space increases\nexponentially with the number of robots. In this paper, we address the problem\nof planning in uncertain environments with a heterogeneous robot team of fast\nscout vehicles for information gathering and more risk-averse carrier robots\nfrom which the scouts vehicles are deployed. To overcome the computational\nchallenges, we represent the environment and operational scenario using a\ntopological graph, where the parameters of the edge weight distributions vary\nwith the state of the robot team on the graph, and we formulate a\ncomputationally efficient mixed-integer program which removes the dependence on\nthe number of robots from its decision space. Our formulation results in the\ncapability to generate optimal multi-robot, long-horizon plans in seconds that\ncould otherwise be computationally intractable. Ultimately our approach enables\nreal-time re-planning, since the computation time is significantly faster than\nthe time to execute one step. We evaluate our approach in a scenario where the\nrobot team must traverse an environment while minimizing detection by observers\nin positions that are uncertain to the robot team. We demonstrate that our\napproach is computationally tractable, can improve performance in the presence\nof imperfect information, and can be adjusted for different risk profiles.\n","authors":["Cora A. Dimmig","Kevin C. Wolfe","Bradley Woosley","Marin Kobilarov","Joseph Moore"],"pdf_url":"https://arxiv.org/pdf/2310.08396v4.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2501.09898v1","updated":"2025-01-17T01:01:44Z","published":"2025-01-17T01:01:44Z","title":"FoundationStereo: Zero-Shot Stereo Matching","summary":" Tremendous progress has been made in deep stereo matching to excel on\nbenchmark datasets through per-domain fine-tuning. However, achieving strong\nzero-shot generalization - a hallmark of foundation models in other computer\nvision tasks - remains challenging for stereo matching. We introduce\nFoundationStereo, a foundation model for stereo depth estimation designed to\nachieve strong zero-shot generalization. To this end, we first construct a\nlarge-scale (1M stereo pairs) synthetic training dataset featuring large\ndiversity and high photorealism, followed by an automatic self-curation\npipeline to remove ambiguous samples. We then design a number of network\narchitecture components to enhance scalability, including a side-tuning feature\nbackbone that adapts rich monocular priors from vision foundation models to\nmitigate the sim-to-real gap, and long-range context reasoning for effective\ncost volume filtering. 
Together, these components lead to strong robustness and\naccuracy across domains, establishing a new standard in zero-shot stereo depth\nestimation.\n","authors":["Bowen Wen","Matthew Trepte","Joseph Aribido","Jan Kautz","Orazio Gallo","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2501.09898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14593v2","updated":"2025-01-17T01:00:13Z","published":"2024-11-21T21:23:46Z","title":"A Systematic Study of Multi-Agent Deep Reinforcement Learning for Safe\n and Robust Autonomous Highway Ramp Entry","summary":" Vehicles today can drive themselves on highways and driverless robotaxis\noperate in major cities, with more sophisticated levels of autonomous driving\nexpected to be available and become more common in the future. Yet, technically\nspeaking, so-called \"Level 5\" (L5) operation, corresponding to full autonomy,\nhas not been achieved. For that to happen, functions such as fully autonomous\nhighway ramp entry must be available, and provide provably safe, and reliably\nrobust behavior to enable full autonomy. We present a systematic study of a\nhighway ramp function that controls the vehicles forward-moving actions to\nminimize collisions with the stream of highway traffic into which a merging\n(ego) vehicle enters. We take a game-theoretic multi-agent (MA) approach to\nthis problem and study the use of controllers based on deep reinforcement\nlearning (DRL). The virtual environment of the MA DRL uses self-play with\nsimulated data where merging vehicles safely learn to control longitudinal\nposition during a taper-type merge. The work presented in this paper extends\nexisting work by studying the interaction of more than two vehicles (agents)\nand does so by systematically expanding the road scene with additional traffic\nand ego vehicles. While previous work on the two-vehicle setting established\nthat collision-free controllers are theoretically impossible in fully\ndecentralized, non-coordinated environments, we empirically show that\ncontrollers learned using our approach are nearly ideal when measured against\nidealized optimal controllers.\n","authors":["Larry Schester","Luis E. Ortiz"],"pdf_url":"https://arxiv.org/pdf/2411.14593v2.pdf","comment":"9 pages, 9 figures; added support ack"},{"id":"http://arxiv.org/abs/2403.11396v2","updated":"2025-01-17T00:26:48Z","published":"2024-03-18T01:08:18Z","title":"Beyond Uncertainty: Risk-Aware Active View Acquisition for Safe Robot\n Navigation and 3D Scene Understanding with FisherRF","summary":" The active view acquisition problem has been extensively studied in the\ncontext of robot navigation using NeRF and 3D Gaussian Splatting. To enhance\nscene reconstruction efficiency and ensure robot safety, we propose the\nRisk-aware Environment Masking (RaEM) framework. RaEM leverages coherent risk\nmeasures to dynamically prioritize safety-critical regions of the unknown\nenvironment, guiding active view acquisition algorithms toward identifying the\nnext-best-view (NBV). Integrated with FisherRF, which selects the NBV by\nmaximizing expected information gain, our framework achieves a dual objective:\nimproving robot safety and increasing efficiency in risk-aware 3D scene\nreconstruction and understanding. 
Extensive high-fidelity experiments validate\nthe effectiveness of our approach, demonstrating its ability to establish a\nrobust and safety-focused framework for active robot exploration and 3D scene\nunderstanding.\n","authors":["Guangyi Liu","Wen Jiang","Boshu Lei","Vivek Pandey","Kostas Daniilidis","Nader Motee"],"pdf_url":"https://arxiv.org/pdf/2403.11396v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2501.10360v1","updated":"2025-01-17T18:59:55Z","published":"2025-01-17T18:59:55Z","title":"FaceXBench: Evaluating Multimodal LLMs on Face Understanding","summary":" Multimodal Large Language Models (MLLMs) demonstrate impressive\nproblem-solving abilities across a wide range of tasks and domains. However,\ntheir capacity for face understanding has not been systematically studied. To\naddress this gap, we introduce FaceXBench, a comprehensive benchmark designed\nto evaluate MLLMs on complex face understanding tasks. FaceXBench includes\n5,000 multimodal multiple-choice questions derived from 25 public datasets and\na newly created dataset, FaceXAPI. These questions cover 14 tasks across 6\nbroad categories, assessing MLLMs' face understanding abilities in bias and\nfairness, face authentication, recognition, analysis, localization and tool\nretrieval. Using FaceXBench, we conduct an extensive evaluation of 26\nopen-source MLLMs alongside 2 proprietary models, revealing the unique\nchallenges in complex face understanding tasks. We analyze the models across\nthree evaluation settings: zero-shot, in-context task description, and\nchain-of-thought prompting. Our detailed analysis reveals that current MLLMs,\nincluding advanced models like GPT-4o, and GeminiPro 1.5, show significant room\nfor improvement. We believe FaceXBench will be a crucial resource for\ndeveloping MLLMs equipped to perform sophisticated face understanding. Code:\nhttps://github.com/Kartik-3004/facexbench\n","authors":["Kartik Narayan","Vibashan VS","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2501.10360v1.pdf","comment":"Project Page: https://kartik-3004.github.io/facexbench/"},{"id":"http://arxiv.org/abs/2501.10357v1","updated":"2025-01-17T18:57:57Z","published":"2025-01-17T18:57:57Z","title":"Zero-Shot Monocular Scene Flow Estimation in the Wild","summary":" Large models have shown generalization across datasets for many low-level\nvision tasks, like depth estimation, but no such general models exist for scene\nflow. Even though scene flow has wide potential use, it is not used in practice\nbecause current predictive models do not generalize well. We identify three key\nchallenges and propose solutions for each.First, we create a method that\njointly estimates geometry and motion for accurate prediction. Second, we\nalleviate scene flow data scarcity with a data recipe that affords us 1M\nannotated training samples across diverse synthetic scenes. Third, we evaluate\ndifferent parameterizations for scene flow prediction and adopt a natural and\neffective parameterization. Our resulting model outperforms existing methods as\nwell as baselines built on large-scale models in terms of 3D end-point error,\nand shows zero-shot generalization to the casually captured videos from DAVIS\nand the robotic manipulation scenes from RoboTAP. 
Overall, our approach makes\nscene flow prediction more practical in-the-wild.\n","authors":["Yiqing Liang","Abhishek Badki","Hang Su","James Tompkin","Orazio Gallo"],"pdf_url":"https://arxiv.org/pdf/2501.10357v1.pdf","comment":"Project Website: https://research.nvidia.com/labs/zero_msf"},{"id":"http://arxiv.org/abs/2501.10343v1","updated":"2025-01-17T18:34:47Z","published":"2025-01-17T18:34:47Z","title":"3rd Workshop on Maritime Computer Vision (MaCVi) 2025: Challenge Results","summary":" The 3rd Workshop on Maritime Computer Vision (MaCVi) 2025 addresses maritime\ncomputer vision for Unmanned Surface Vehicles (USV) and underwater. This report\noffers a comprehensive overview of the findings from the challenges. We provide\nboth statistical and qualitative analyses, evaluating trends from over 700\nsubmissions. All datasets, evaluation code, and the leaderboard are available\nto the public at https://macvi.org/workshop/macvi25.\n","authors":["Benjamin Kiefer","Lojze Žust","Jon Muhovič","Matej Kristan","Janez Perš","Matija Teršek","Uma Mudenagudi Chaitra Desai","Arnold Wiliem","Marten Kreis","Nikhil Akalwadi","Yitong Quan","Zhiqiang Zhong","Zhe Zhang","Sujie Liu","Xuran Chen","Yang Yang","Matej Fabijanić","Fausto Ferreira","Seongju Lee","Junseok Lee","Kyoobin Lee","Shanliang Yao","Runwei Guan","Xiaoyu Huang","Yi Ni","Himanshu Kumar","Yuan Feng","Yi-Ching Cheng","Tzu-Yu Lin","Chia-Ming Lee","Chih-Chung Hsu","Jannik Sheikh","Andreas Michel","Wolfgang Gross","Martin Weinmann","Josip Šarić","Yipeng Lin","Xiang Yang","Nan Jiang","Yutang Lu","Fei Feng","Ali Awad","Evan Lucas","Ashraf Saleem","Ching-Heng Cheng","Yu-Fan Lin","Tzu-Yu Lin","Chih-Chung Hsu"],"pdf_url":"https://arxiv.org/pdf/2501.10343v1.pdf","comment":"Part of the MaCVi 2025 workshop"},{"id":"http://arxiv.org/abs/2412.19794v4","updated":"2025-01-17T18:18:21Z","published":"2024-12-27T18:47:05Z","title":"MVTamperBench: Evaluating Robustness of Vision-Language Models","summary":" Multimodal Large Language Models (MLLMs) have driven major advances in video\nunderstanding, yet their vulnerability to adversarial tampering and\nmanipulations remains underexplored. To address this gap, we introduce\nMVTamperBench, a benchmark that systematically evaluates MLLM robustness\nagainst five prevalent tampering techniques: rotation, masking, substitution,\nrepetition, and dropping. Built from 3.4K original videos-expanded to over 17K\ntampered clips spanning 19 video tasks.\n MVTamperBench challenges models to detect manipulations in spatial and\ntemporal coherence. We evaluate 45 recent MLLMs from 15+ model families,\nrevealing substantial variability in resilience across tampering types and\nshowing that larger parameter counts do not necessarily guarantee robustness.\nMVTamperBench sets a new benchmark for developing tamper-resilient MLLM in\nsafety-critical applications, including detecting clickbait, preventing harmful\ncontent distribution, and enforcing policies on media platforms. 
We release all\ncode and data to foster open research in trustworthy video understanding.\n Code: https://amitbcp.github.io/MVTamperBench/ Data:\nhttps://huggingface.co/datasets/Srikant86/MVTamperBench\n","authors":["Amit Agarwal","Srikant Panda","Angeline Charles","Bhargava Kumar","Hitesh Patel","Priyaranjan Pattnayak","Taki Hasan Rafi","Tejaswini Kumar","Dong-Kyu Chae"],"pdf_url":"https://arxiv.org/pdf/2412.19794v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10325v1","updated":"2025-01-17T17:56:52Z","published":"2025-01-17T17:56:52Z","title":"DiffStereo: High-Frequency Aware Diffusion Model for Stereo Image\n Restoration","summary":" Diffusion models (DMs) have achieved promising performance in image\nrestoration but haven't been explored for stereo images. The application of DM\nin stereo image restoration is confronted with a series of challenges. The need\nto reconstruct two images exacerbates DM's computational cost. Additionally,\nexisting latent DMs usually focus on semantic information and remove\nhigh-frequency details as redundancy during latent compression, which is\nprecisely what matters for image restoration. To address the above problems, we\npropose a high-frequency aware diffusion model, DiffStereo for stereo image\nrestoration as the first attempt at DM in this domain. Specifically, DiffStereo\nfirst learns latent high-frequency representations (LHFR) of HQ images. DM is\nthen trained in the learned space to estimate LHFR for stereo images, which are\nfused into a transformer-based stereo image restoration network providing\nbeneficial high-frequency information of corresponding HQ images. The\nresolution of LHFR is kept the same as input images, which preserves the\ninherent texture from distortion. And the compression in channels alleviates\nthe computational burden of DM. Furthermore, we devise a position encoding\nscheme when integrating the LHFR into the restoration network, enabling\ndistinctive guidance in different depths of the restoration network.\nComprehensive experiments verify that by combining generative DM and\ntransformer, DiffStereo achieves both higher reconstruction accuracy and better\nperceptual quality on stereo super-resolution, deblurring, and low-light\nenhancement compared with state-of-the-art methods.\n","authors":["Huiyun Cao","Yuan Shi","Bin Xia","Xiaoyu Jin","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2501.10325v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2501.10324v1","updated":"2025-01-17T17:56:27Z","published":"2025-01-17T17:56:27Z","title":"New Fashion Products Performance Forecasting: A Survey on Evolutions,\n Models and Emerging Trends","summary":" The fast fashion industry's insatiable demand for new styles and rapid\nproduction cycles has led to a significant environmental burden.\nOverproduction, excessive waste, and harmful chemicals have contributed to the\nnegative environmental impact of the industry. To mitigate these issues, a\nparadigm shift that prioritizes sustainability and efficiency is urgently\nneeded. Integrating learning-based predictive analytics into the fashion\nindustry represents a significant opportunity to address environmental\nchallenges and drive sustainable practices. By forecasting fashion trends and\noptimizing production, brands can reduce their ecological footprint while\nremaining competitive in a rapidly changing market. However, one of the key\nchallenges in forecasting fashion sales is the dynamic nature of consumer\npreferences. 
Fashion is acyclical, with trends constantly evolving and\nresurfacing. In addition, cultural changes and unexpected events can disrupt\nestablished patterns. This problem is also known as New Fashion Products\nPerformance Forecasting (NFPPF), and it has recently gained more and more\ninterest in the global research landscape. Given its multidisciplinary nature,\nthe field of NFPPF has been approached from many different angles. This\ncomprehensive survey wishes to provide an up-to-date overview that focuses on\nlearning-based NFPPF strategies. The survey is based on the Preferred Reporting\nItems for Systematic Reviews and Meta-Analyses (PRISMA) methodological flow,\nallowing for a systematic and complete literature review. In particular, we\npropose the first taxonomy that covers the learning panorama for NFPPF,\nexamining in detail the different methodologies used to increase the amount of\nmultimodal information, as well as the state-of-the-art available datasets.\nFinally, we discuss the challenges and future directions.\n","authors":["Andrea Avogaro","Luigi Capogrosso","Andrea Toaiari","Franco Fummi","Marco Cristani"],"pdf_url":"https://arxiv.org/pdf/2501.10324v1.pdf","comment":"Accepted at the Springer Nature Computer Science journal"},{"id":"http://arxiv.org/abs/2501.10318v1","updated":"2025-01-17T17:41:47Z","published":"2025-01-17T17:41:47Z","title":"HiMix: Reducing Computational Complexity in Large Vision-Language Models","summary":" Benefiting from recent advancements in large language models and modality\nalignment techniques, existing Large Vision-Language Models(LVLMs) have\nachieved prominent performance across a wide range of scenarios. However, the\nexcessive computational complexity limits the widespread use of these models in\npractical applications. We argue that one main bottleneck in computational\ncomplexity is caused by the involvement of redundant vision sequences in model\ncomputation. This is inspired by a reassessment of the efficiency of vision and\nlanguage information transmission in the language decoder of LVLMs. Then, we\npropose a novel hierarchical vision-language interaction mechanism called\nHierarchical Vision injection for Mixture Attention (HiMix). In HiMix, only the\nlanguage sequence undergoes full forward propagation, while the vision sequence\ninteracts with the language at specific stages within each language decoder\nlayer. It is striking that our approach significantly reduces computational\ncomplexity with minimal performance loss. Specifically, HiMix achieves a 10x\nreduction in the computational cost of the language decoder across multiple\nLVLM models while maintaining comparable performance. This highlights the\nadvantages of our method, and we hope our research brings new perspectives to\nthe field of vision-language understanding. Project Page:\nhttps://xuange923.github.io/HiMix\n","authors":["Xuange Zhang","Dengjie Li","Bo Liu","Zenghao Bao","Yao Zhou","Baisong Yang","Zhongying Liu","Yujie Zhong","Zheng Zhao","Tongtong Yuan"],"pdf_url":"https://arxiv.org/pdf/2501.10318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.09600v2","updated":"2025-01-17T17:07:31Z","published":"2025-01-16T15:22:06Z","title":"Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid\n Prototyping in Virtual Reality Applications","summary":" SLAM is a foundational technique with broad applications in robotics and\nAR/VR. 
SLAM simulations evaluate new concepts, but testing on\nresource-constrained devices, such as VR HMDs, faces challenges: high\ncomputational cost and restricted sensor data access. This work proposes a\nsparse framework using mesh geometry projections as features, which improves\nefficiency and circumvents direct sensor data access, advancing SLAM research\nas we demonstrate in VR and through numerical evaluation.\n","authors":["Carlos Augusto Pinheiro de Sousa","Heiko Hamann","Oliver Deussen"],"pdf_url":"https://arxiv.org/pdf/2501.09600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2501.10283v1","updated":"2025-01-17T16:26:24Z","published":"2025-01-17T16:26:24Z","title":"GSTAR: Gaussian Surface Tracking and Reconstruction","summary":" 3D Gaussian Splatting techniques have enabled efficient photo-realistic\nrendering of static scenes. Recent works have extended these approaches to\nsupport surface reconstruction and tracking. However, tracking dynamic surfaces\nwith 3D Gaussians remains challenging due to complex topology changes, such as\nsurfaces appearing, disappearing, or splitting. To address these challenges, we\npropose GSTAR, a novel method that achieves photo-realistic rendering, accurate\nsurface reconstruction, and reliable 3D tracking for general dynamic scenes\nwith changing topology. Given multi-view captures as input, GSTAR binds\nGaussians to mesh faces to represent dynamic objects. For surfaces with\nconsistent topology, GSTAR maintains the mesh topology and tracks the meshes\nusing Gaussians. In regions where topology changes, GSTAR adaptively unbinds\nGaussians from the mesh, enabling accurate registration and the generation of\nnew surfaces based on these optimized Gaussians. Additionally, we introduce a\nsurface-based scene flow method that provides robust initialization for\ntracking between frames. Experiments demonstrate that our method effectively\ntracks and reconstructs dynamic surfaces, enabling a range of applications. Our\nproject page with the code release is available at\nhttps://chengwei-zheng.github.io/GSTAR/.\n","authors":["Chengwei Zheng","Lixin Xue","Juan Zarate","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2501.10283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09374v2","updated":"2025-01-17T15:52:06Z","published":"2024-10-12T05:35:27Z","title":"ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras","summary":" Event-based visual odometry is a specific branch of visual Simultaneous\nLocalization and Mapping (SLAM) techniques, which aims at solving tracking and\nmapping subproblems (typically in parallel), by exploiting the special working\nprinciples of neuromorphic (i.e., event-based) cameras. Due to the\nmotion-dependent nature of event data, explicit data association (i.e., feature\nmatching) under large-baseline view-point changes is difficult to establish,\nmaking direct methods a more rational choice. However, state-of-the-art direct\nmethods are limited by the high computational complexity of the mapping\nsub-problem and the degeneracy of camera pose tracking in certain degrees of\nfreedom (DoF) in rotation. 
In this paper, we tackle these issues by building an\nevent-based stereo visual-inertial odometry system on top of a direct pipeline.\nSpecifically, to speed up the mapping operation, we propose an efficient\nstrategy for sampling contour points according to the local dynamics of events.\nThe mapping performance is also improved in terms of structure completeness and\nlocal smoothness by merging the temporal stereo and static stereo results. To\ncircumvent the degeneracy of camera pose tracking in recovering the pitch and\nyaw components of general 6-DoF motion, we introduce IMU measurements as motion\npriors via pre-integration. To this end, a compact back-end is proposed for\ncontinuously updating the IMU bias and predicting the linear velocity, enabling\nan accurate motion prediction for camera pose tracking. The resulting system\nscales well with modern high-resolution event cameras and leads to better\nglobal positioning accuracy in large-scale outdoor environments. Extensive\nevaluations on five publicly available datasets featuring different resolutions\nand scenarios justify the superior performance of the proposed system against\nfive state-of-the-art methods.\n","authors":["Junkai Niu","Sheng Zhong","Xiuyuan Lu","Shaojie Shen","Guillermo Gallego","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.09374v2.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs 
zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Robotics 20 + +
+
+
+ + ☆ DexForce: Extracting Force-informed Actions from Kinesthetic + Demonstrations for Dexterous Manipulation + + +
+ Imitation learning requires high-quality demonstrations consisting of +sequences of state-action pairs. For contact-rich dexterous manipulation tasks +that require fine-grained dexterity, the actions in these state-action pairs +must produce the right forces. Current widely-used methods for collecting +dexterous manipulation demonstrations are difficult to use for demonstrating +contact-rich tasks due to unintuitive human-to-robot motion retargeting and the +lack of direct haptic feedback. Motivated by this, we propose DexForce, a +method for collecting demonstrations of contact-rich dexterous manipulation. +DexForce leverages contact forces, measured during kinesthetic demonstrations, +to compute force-informed actions for policy learning. We use DexForce to +collect demonstrations for six tasks and show that policies trained on our +force-informed actions achieve an average success rate of 76% across all tasks. +In contrast, policies trained directly on actions that do not account for +contact forces have near-zero success rates. We also conduct a study ablating +the inclusion of force data in policy observations. We find that while using +force data never hurts policy performance, it helps the most for tasks that +require an advanced level of precision and coordination, like opening an +AirPods case and unscrewing a nut. + +
+
+ comment: Videos can be found here: + https://clairelc.github.io/dexforce.github.io/ +
+
+
+
+
+ + ☆ Deployment of an Aerial Multi-agent System for Automated Task Execution + in Large-scale Underground Mining Environments + + +
+ In this article, we present a framework for deploying an aerial multi-agent +system in large-scale subterranean environments with minimal infrastructure for +supporting multi-agent operations. The multi-agent objective is to optimally +and reactively allocate and execute inspection tasks in a mine, which are +entered by a mine operator on-the-fly. The assignment of currently available +tasks to the team of agents is accomplished through an auction-based system, +where the agents bid for available tasks, which are used by a central +auctioneer to optimally assigns tasks to agents. A mobile Wi-Fi mesh supports +inter-agent communication and bi-directional communication between the agents +and the task allocator, while the task execution is performed completely +infrastructure-free. Given a task to be accomplished, a reliable and modular +agent behavior is synthesized by generating behavior trees from a pool of agent +capabilities, using a back-chaining approach. The auction system in the +proposed framework is reactive and supports addition of new operator-specified +tasks on-the-go, at any point through a user-friendly operator interface. The +framework has been validated in a real underground mining environment using +three aerial agents, with several inspection locations spread in an environment +of almost 200 meters. The proposed framework can be utilized for missions +involving rapid inspection, gas detection, distributed sensing and mapping etc. +in a subterranean environment. The proposed framework and its field deployment +contributes towards furthering reliable automation in large-scale subterranean +environments to offload both routine and dangerous tasks from human operators +to autonomous aerial robots. + +
+
+ comment: Submitted to IEEE Transactions on Field Robotics +
+
+
+
+
+ + ☆ Tethered Variable Inertial Attitude Control Mechanisms through a Modular + Jumping Limbed Robot + + +
+ This paper presents the concept of a tethered variable inertial attitude +control mechanism for a modular jumping-limbed robot designed for planetary +exploration in low-gravity environments. The system, named SPLITTER, comprises +two sub-10 kg quadrupedal robots connected by a tether, capable of executing +successive jumping gaits and stabilizing in-flight using inertial morphing +technology. Through model predictive control (MPC), attitude control was +demonstrated by adjusting the limbs and tether length to modulate the system's +principal moments of inertia. Our results indicate that this control strategy +allows the robot to stabilize during flight phases without needing traditional +flywheel-based systems or relying on aerodynamics, making the approach +mass-efficient and ideal for small-scale planetary robots' successive jumps. +The paper outlines the dynamics, MPC formulation for inertial morphing, +actuator requirements, and simulation results, illustrating the potential of +agile exploration for small-scale rovers in low-gravity environments like the +Moon or asteroids. + +
+
+ comment: Proceeding to IEEE Aerospace Conference 2025 +
+
+
+
+
+ + ☆ Universal Actions for Enhanced Embodied Foundation Models + + +
+ Training on diverse, internet-scale data is a key factor in the success of +recent large foundation models. Yet, using the same recipe for building +embodied agents has faced noticeable difficulties. Despite the availability of +many crowd-sourced embodied datasets, their action spaces often exhibit +significant heterogeneity due to distinct physical embodiment and control +interfaces for different robots, causing substantial challenges in developing +embodied foundation models using cross-domain data. In this paper, we introduce +UniAct, a new embodied foundation modeling framework operating in a tokenized +Universal Action Space. Our learned universal actions capture the generic +atomic behaviors across diverse robots by exploiting their shared structural +features, and enable enhanced cross-domain data utilization and +cross-embodiment generalizations by eliminating the notorious heterogeneity. +The universal actions can be efficiently translated back to heterogeneous +actionable commands by simply adding embodiment-specific details, from which +fast adaptation to new robots becomes simple and straightforward. Our 0.5B +instantiation of UniAct outperforms 14X larger SOTA embodied foundation models +in extensive evaluations on various real-world and simulation robots, +showcasing exceptional cross-embodiment control and adaptation capability, +highlighting the crucial benefit of adopting universal actions. Project page: +https://github.com/2toinf/UniAct + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Robotic World Model: A Neural Network Simulator for Robust Policy + Optimization in Robotics + + +
+ Learning robust and generalizable world models is crucial for enabling +efficient and scalable robotic control in real-world environments. In this +work, we introduce a novel framework for learning world models that accurately +capture complex, partially observable, and stochastic dynamics. The proposed +method employs a dual-autoregressive mechanism and self-supervised training to +achieve reliable long-horizon predictions without relying on domain-specific +inductive biases, ensuring adaptability across diverse robotic tasks. We +further propose a policy optimization framework that leverages world models for +efficient training in imagined environments and seamless deployment in +real-world systems. Through extensive experiments, our approach consistently +outperforms state-of-the-art methods, demonstrating superior autoregressive +prediction accuracy, robustness to noise, and generalization across +manipulation and locomotion tasks. Notably, policies trained with our method +are successfully deployed on ANYmal D hardware in a zero-shot transfer, +achieving robust performance with minimal sim-to-real performance loss. This +work advances model-based reinforcement learning by addressing the challenges +of long-horizon prediction, error accumulation, and sim-to-real transfer. By +providing a scalable and robust framework, the introduced methods pave the way +for adaptive and efficient robotic systems in real-world applications. + +
+
+
+
+
+ + ☆ SpatialCoT: Advancing Spatial Reasoning through Coordinate Alignment and + Chain-of-Thought for Embodied Task Planning + + +
+ Spatial reasoning is an essential problem in embodied AI research. Efforts to +enhance spatial reasoning abilities through supplementary spatial data and +fine-tuning have proven limited and ineffective when addressing complex +embodied tasks, largely due to their dependence on language-based outputs. +While some approaches have introduced a point-based action space to mitigate +this issue, they fall short in managing more intricate tasks within complex +environments. This deficiency arises from their failure to fully exploit the +inherent thinking and reasoning capabilities that are fundamental strengths of +Vision-Language Models (VLMs). To address these limitations, we propose a novel +approach named SpatialCoT, specifically designed to bolster the spatial +reasoning capabilities of VLMs. Our approach comprises two stages: spatial +coordinate bi-directional alignment, which aligns vision-language inputs with +spatial coordinates, and chain-of-thought spatial grounding, which harnesses +the reasoning capabilities of language models for advanced spatial reasoning. +We evaluate SpatialCoT on challenging navigation and manipulation tasks, both +in simulation and real-world settings. Experimental results demonstrate that +our method significantly outperforms previous state-of-the-art approaches in +both tasks. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ A Comprehensive Insights into Drones: History, Classification, + Architecture, Navigation, Applications, Challenges, and Future Trends + + +
+ Unmanned Aerial Vehicles (UAVs), commonly known as drones, are among the most +transformative technologies of the 21st century. Although drones first emerged for military use, +advancements in materials, electronics, and software have catapulted them +into multipurpose tools for a wide range of industries. In this paper, we +cover the history, taxonomy, architecture, navigation systems, and related +application areas of drones. The paper also explores important future trends like autonomous +navigation, AI integration, and obstacle avoidance systems, emphasizing how +they contribute to improving the efficiency and versatility of drones. It further +examines the major technical, environmental, economic, +regulatory, and ethical challenges that limit the actual adoption of drones, as well as +trends that are likely to mitigate these obstacles in the future. This work +offers a structured synthesis of existing studies and perspectives that enable +insights about how drones will transform agriculture, logistics, healthcare, +disaster management, and other areas, while also identifying new opportunities +for innovation and development. +
+
+
+
+
+ + ☆ Adaptive Twisting Sliding Control for Integrated Attack UAV's Autopilot + and Guidance + + +
+ This paper investigates an adaptive sliding-mode control scheme for an integrated +UAV autopilot and guidance system. First, a two-dimensional mathematical model +of the system is derived by considering the incorporated lateral dynamics and +relative kinematics of the UAV and its potential target of attack. Then, a +sliding surface is derived utilizing the zero-effort miss distance. An adaptive +twisting sliding mode (ATSMC) algorithm is applied to the integrated system. +Simulations and comparisons have been carried out. The results show that our +proposed design achieves high interception precision, even with high +nonlinearity, uncertainties, disturbances, and abrupt changes in the target's +movement, thanks to the adaptation strategy. +
+
+ comment: in Proceedings of the 2025 International Conference on Energy, + Infrastructure and Environmental Research (EIER2025) +
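+ For readers unfamiliar with twisting-type controllers, the non-adaptive super-twisting algorithm on which such designs build can be sketched in a few lines. The gains, time step, and toy perturbed system below are assumptions; the paper's adaptive gain law and zero-effort-miss sliding surface are not reproduced.
```python
import numpy as np

def super_twisting_step(s, v, k1=1.5, k2=1.1, dt=0.001):
    """One integration step of the standard super-twisting controller
    u = -k1*sqrt(|s|)*sign(s) + v,  dv/dt = -k2*sign(s),
    where s is the sliding variable."""
    u = -k1 * np.sqrt(abs(s)) * np.sign(s) + v
    v_next = v + dt * (-k2 * np.sign(s))
    return u, v_next

# Drive a toy first-order perturbed system ds/dt = u + d(t) onto s = 0.
s, v = 2.0, 0.0
for k in range(5000):
    u, v = super_twisting_step(s, v)
    d = 0.3 * np.sin(0.001 * k)          # slowly varying matched disturbance
    s += 0.001 * (u + d)
print(f"|s| after 5 s: {abs(s):.4f}")
```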
+
+
+
+
+ + SLIM: Sim-to-Real Legged Instructive Manipulation via Long-Horizon + Visuomotor Learning + + +
+ We present a low-cost quadruped manipulation system that solves long-horizon +real-world tasks, trained by reinforcement learning purely in simulation. The +system comprises 1) a hierarchical design of a high-level policy for +visual-mobile manipulation following instructions, and a low-level policy for +quadruped movement and limb control, 2) a progressive policy expansion approach +for solving the long-horizon task, together with a teacher-student framework for +efficient training of the high-level visuomotor policy, and 3) a +suite of techniques for minimizing sim-to-real gaps. + With budget-friendly hardware of limited reliability and performance, and +just one wrist-mounted RGB camera, the entire system, fully trained in +simulation, achieves high success rates for long-horizon tasks involving search, +move, grasp, and drop-into, with fluid sim-to-real transfer in a wide variety +of indoor and outdoor scenes and lighting conditions. Extensive real-world +evaluations show that on long-horizon mobile manipulation tasks, our system +achieves good performance when transferred to the real world, in terms of both task +success rate and execution efficiency. Finally, we discuss the necessity of our +sim-to-real techniques for legged mobile manipulation, and present ablation +results for them. +
+
+
+
+
+ + ☆ FoundationStereo: Zero-Shot Stereo Matching + + +
+ Tremendous progress has been made in deep stereo matching to excel on +benchmark datasets through per-domain fine-tuning. However, achieving strong +zero-shot generalization - a hallmark of foundation models in other computer +vision tasks - remains challenging for stereo matching. We introduce +FoundationStereo, a foundation model for stereo depth estimation designed to +achieve strong zero-shot generalization. To this end, we first construct a +large-scale (1M stereo pairs) synthetic training dataset featuring large +diversity and high photorealism, followed by an automatic self-curation +pipeline to remove ambiguous samples. We then design a number of network +architecture components to enhance scalability, including a side-tuning feature +backbone that adapts rich monocular priors from vision foundation models to +mitigate the sim-to-real gap, and long-range context reasoning for effective +cost volume filtering. Together, these components lead to strong robustness and +accuracy across domains, establishing a new standard in zero-shot stereo depth +estimation. + +
+
+
+
+
+ + ♻ ☆ Optimal Virtual Model Control for Robotics: Design and Tuning of + Passivity-Based Controllers + + +
+ Passivity-based control is a cornerstone of control theory and an established +design approach in robotics. Its strength is based on the passivity theorem, +which provides a powerful interconnection framework for robotics. However, the +design of passivity-based controllers and their optimal tuning remain +challenging. We propose here an intuitive design approach for fully actuated +robots, where the control action is determined by a `virtual-mechanism' as in +classical virtual model control. The result is a robot whose controlled +behavior can be understood in terms of physics. We achieve optimal tuning by +applying algorithmic differentiation to ODE simulations of the rigid body +dynamics. Overall, this leads to a flexible design and optimization approach: +stability is proven by passivity of the virtual mechanism, while performance is +obtained by optimization using algorithmic differentiation. + +
+
+ comment: 14 pages, 17 figures +
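+ The 'virtual mechanism' idea can be illustrated with the textbook case of a task-space virtual spring-damper mapped to joint torques through the Jacobian transpose. This is only a minimal sketch of classical virtual model control, not the paper's design or its algorithmic-differentiation-based tuning; the toy two-link arm, gains, and function names are assumptions.
```python
import numpy as np

def virtual_spring_damper_torque(q, dq, x_des, K, D, fk, jacobian):
    """Joint torques produced by a virtual spring-damper attached between
    the end-effector and a setpoint: tau = J^T (K (x_des - x) - D * J dq).
    Gravity compensation would be added on a real robot."""
    x = fk(q)                       # current end-effector position
    J = jacobian(q)                 # task Jacobian at q
    f_virtual = K @ (x_des - x) - D @ (J @ dq)
    return J.T @ f_virtual

# Toy 2-link planar arm (unit link lengths) as a stand-in manipulator.
def fk(q):
    return np.array([np.cos(q[0]) + np.cos(q[0] + q[1]),
                     np.sin(q[0]) + np.sin(q[0] + q[1])])

def jacobian(q):
    s1, s12 = np.sin(q[0]), np.sin(q[0] + q[1])
    c1, c12 = np.cos(q[0]), np.cos(q[0] + q[1])
    return np.array([[-s1 - s12, -s12],
                     [ c1 + c12,  c12]])

tau = virtual_spring_damper_torque(
    q=np.array([0.3, 0.5]), dq=np.zeros(2), x_des=np.array([1.2, 0.8]),
    K=np.diag([50.0, 50.0]), D=np.diag([5.0, 5.0]), fk=fk, jacobian=jacobian)
print(tau)
```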
+
+
+
+
+ + ♻ ☆ Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid + Prototyping in Virtual Reality Applications + + +
+ SLAM is a foundational technique with broad applications in robotics and +AR/VR. SLAM simulations evaluate new concepts, but testing on +resource-constrained devices, such as VR HMDs, faces challenges: high +computational cost and restricted sensor data access. This work proposes a +sparse framework using mesh geometry projections as features, which improves +efficiency and circumvents direct sensor data access, advancing SLAM research +as we demonstrate in VR and through numerical evaluation. + +
+
+
+
+
+ + ♻ ☆ STPOTR: Simultaneous Human Trajectory and Pose Prediction Using a + Non-Autoregressive Transformer for Robot Following Ahead + + +
+ In this paper, we develop a neural network model to predict future human +motion from an observed human motion history. We propose a non-autoregressive +transformer architecture to leverage its parallel nature for easier training +and fast, accurate predictions at test time. The proposed architecture divides +human motion prediction into two parts: 1) the human trajectory, which is the +3D position of the hip joint over time, and 2) the human pose, which is the +3D positions of all other joints over time with respect to a fixed hip joint. We propose to +make the two predictions simultaneously, as the shared representation can +improve the model performance. Therefore, the model consists of two sets of +encoders and decoders. First, a multi-head attention module applied to encoder +outputs improves human trajectory prediction. Second, another multi-head self-attention +module applied to encoder outputs concatenated with decoder outputs facilitates +learning of temporal dependencies. Our model is well-suited for robotic +applications in terms of test accuracy and speed, and compares favorably with +respect to state-of-the-art methods. We demonstrate the real-world +applicability of our work via the Robot Follow-Ahead task, a challenging yet +practical case study for our proposed model. +
+
+
+
+
+ + ♻ ☆ ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras + + +
+ Event-based visual odometry is a specific branch of visual Simultaneous +Localization and Mapping (SLAM) techniques, which aims at solving tracking and +mapping subproblems (typically in parallel), by exploiting the special working +principles of neuromorphic (i.e., event-based) cameras. Due to the +motion-dependent nature of event data, explicit data association (i.e., feature +matching) under large-baseline view-point changes is difficult to establish, +making direct methods a more rational choice. However, state-of-the-art direct +methods are limited by the high computational complexity of the mapping +sub-problem and the degeneracy of camera pose tracking in certain degrees of +freedom (DoF) in rotation. In this paper, we tackle these issues by building an +event-based stereo visual-inertial odometry system on top of a direct pipeline. +Specifically, to speed up the mapping operation, we propose an efficient +strategy for sampling contour points according to the local dynamics of events. +The mapping performance is also improved in terms of structure completeness and +local smoothness by merging the temporal stereo and static stereo results. To +circumvent the degeneracy of camera pose tracking in recovering the pitch and +yaw components of general 6-DoF motion, we introduce IMU measurements as motion +priors via pre-integration. To this end, a compact back-end is proposed for +continuously updating the IMU bias and predicting the linear velocity, enabling +an accurate motion prediction for camera pose tracking. The resulting system +scales well with modern high-resolution event cameras and leads to better +global positioning accuracy in large-scale outdoor environments. Extensive +evaluations on five publicly available datasets featuring different resolutions +and scenarios justify the superior performance of the proposed system against +five state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ BILTS: A Bi-Invariant Similarity Measure for Robust Object Trajectory + Recognition under Reference Frame Variations + + +
+ When similar object motions are performed in diverse contexts but are meant +to be recognized under a single classification, these contextual variations act +as disturbances that negatively affect accurate motion recognition. In this +paper, we focus on contextual variations caused by reference frame variations. +To robustly deal with these variations, similarity measures have been +introduced that compare object motion trajectories in a context-invariant +manner. However, most are highly sensitive to noise near singularities, where +the measure is not uniquely defined, and lack bi-invariance (invariance to both +world and body frame variations). To address these issues, we propose the novel +\textit{Bi-Invariant Local Trajectory-Shape Similarity} (BILTS) measure. +Compared to other measures, the BILTS measure uniquely offers bi-invariance, +boundedness, and third-order shape identity. Aimed at practical +implementations, we devised a discretized and regularized version of the BILTS +measure which shows exceptional robustness to singularities. This is +demonstrated through rigorous recognition experiments using multiple datasets. +On average, BILTS attained the highest recognition ratio and least sensitivity +to contextual variations compared to other invariant object motion similarity +measures. We believe that the BILTS measure is a valuable tool for recognizing +motions performed in diverse contexts and has potential in other applications, +including the recognition, segmentation, and adaptation of both motion and +force trajectories. + +
+
+ comment: This work has been submitted as a regular research paper for + consideration in the Journal of Intelligent & Robotic Systems. The content in + this preprint is identical to the version submitted for peer review, except + for formatting differences required by the journal +
+
+
+
+
+ + ♻ ☆ Safe Interval Randomized Path Planning For Manipulators + + +
+ Planning safe paths in a 3D workspace for high-DoF robotic systems, such as +manipulators, is a challenging problem, especially when the environment is +populated with dynamic obstacles that must be avoided. In this case, the +time dimension must be taken into account, which further increases the +complexity of planning. To mitigate this issue, we propose combining +safe-interval path planning (a prominent technique in heuristic search) with +randomized planning, specifically with bidirectional rapidly-exploring +random trees (RRT-Connect), a fast and efficient algorithm for +high-dimensional planning. Leveraging a dedicated technique for fast computation +of safe intervals, we obtain an efficient planner dubbed SI-RRT. We +compare it with the state of the art and show that SI-RRT consistently +outperforms the competitors in both runtime and solution cost. + Our implementation of SI-RRT is publicly available at +https://github.com/PathPlanning/ManipulationPlanning-SI-RRT +
+
+ comment: Submitted to The 35th International Conference on Automated Planning + and Scheduling (ICAPS 2025) +
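+ The bookkeeping at the heart of safe-interval planning, turning the collision intervals of a configuration into the complementary safe intervals, can be sketched generically as below; the paper's fast interval computation for manipulators and its integration with RRT-Connect are more involved and are not reproduced here.
```python
def safe_intervals(collision_intervals, horizon=float("inf")):
    """Complement a list of (start, end) collision intervals over [0, horizon]
    into the safe intervals used by SIPP-style planners."""
    safe, t = [], 0.0
    for start, end in sorted(collision_intervals):
        if start > t:                 # gap before this collision interval
            safe.append((t, start))
        t = max(t, end)
    if t < horizon:                   # everything after the last collision
        safe.append((t, horizon))
    return safe

# A configuration swept by dynamic obstacles during [2, 3) and [5, 6.5).
print(safe_intervals([(2.0, 3.0), (5.0, 6.5)], horizon=10.0))
# -> [(0.0, 2.0), (3.0, 5.0), (6.5, 10.0)]
```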
+
+
+
+
+ + ♻ ☆ Sensor-Based Distributionally Robust Control for Safe Robot Navigation + in Dynamic Environments + + +
+ We introduce a novel method for mobile robot navigation in dynamic, unknown +environments, leveraging onboard sensing and distributionally robust +optimization to impose probabilistic safety constraints. Our method introduces +a distributionally robust control barrier function (DR-CBF) that directly +integrates noisy sensor measurements and state estimates to define safety +constraints. This approach is applicable to a wide range of control-affine +dynamics, generalizable to robots with complex geometries, and capable of +operating at real-time control frequencies. Coupled with a control Lyapunov +function (CLF) for path following, the proposed CLF-DR-CBF control synthesis +method achieves safe, robust, and efficient navigation in challenging +environments. We demonstrate the effectiveness and robustness of our approach +for safe autonomous navigation under uncertainty in simulations and real-world +experiments with differential-drive robots. + +
+
+ comment: Project page: https://existentialrobotics.org/DRO_Safe_Navigation +
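+ The control barrier function machinery underlying such methods can be illustrated, without the distributionally robust tightening, by the standard closed-form CBF safety filter for a single-integrator robot and one circular obstacle; all gains and names below are assumptions, not the paper's formulation.
```python
import numpy as np

def cbf_safe_velocity(x, x_goal, obstacle, r_safe, alpha=1.0, k_goal=1.0, v_max=1.0):
    """Safety filter for a single-integrator robot xdot = u.
    Barrier h(x) = ||x - obstacle||^2 - r_safe^2 must satisfy
    hdot >= -alpha*h, i.e. 2*(x - obstacle)^T u >= -alpha*h.
    The nominal goal-seeking velocity is projected onto that halfspace
    (closed-form solution of the usual CBF quadratic program)."""
    u_nom = k_goal * (x_goal - x)                  # nominal controller
    d = x - obstacle
    h = d @ d - r_safe**2
    a, b = 2.0 * d, -alpha * h                     # constraint a^T u >= b
    if a @ u_nom < b:                              # nominal input is unsafe
        u = u_nom + (b - a @ u_nom) / (a @ a) * a  # minimal correction
    else:
        u = u_nom
    n = np.linalg.norm(u)
    return u if n <= v_max else u / n * v_max      # respect actuation limit

x = np.array([0.0, 0.0])
print(cbf_safe_velocity(x, x_goal=np.array([5.0, 0.1]),
                        obstacle=np.array([2.0, 0.0]), r_safe=1.0))
```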
+
+
+
+
+ + ♻ ☆ Uncertainty-Aware Planning for Heterogeneous Robot Teams using Dynamic + Topological Graphs and Mixed-Integer Programming + + +
+ Multi-robot planning and coordination in uncertain environments is a +fundamental computational challenge, since the belief space increases +exponentially with the number of robots. In this paper, we address the problem +of planning in uncertain environments with a heterogeneous robot team of fast +scout vehicles for information gathering and more risk-averse carrier robots +from which the scout vehicles are deployed. To overcome the computational +challenges, we represent the environment and operational scenario using a +topological graph, where the parameters of the edge weight distributions vary +with the state of the robot team on the graph, and we formulate a +computationally efficient mixed-integer program which removes the dependence on +the number of robots from its decision space. Our formulation results in the +capability to generate optimal multi-robot, long-horizon plans in seconds for problems that +could otherwise be computationally intractable. Ultimately, our approach enables +real-time re-planning, since the computation time is significantly shorter than +the time required to execute one step. We evaluate our approach in a scenario where the +robot team must traverse an environment while minimizing detection by observers +in positions that are uncertain to the robot team. We demonstrate that our +approach is computationally tractable, can improve performance in the presence +of imperfect information, and can be adjusted for different risk profiles. +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ A Systematic Study of Multi-Agent Deep Reinforcement Learning for Safe + and Robust Autonomous Highway Ramp Entry + + +
+ Vehicles today can drive themselves on highways and driverless robotaxis +operate in major cities, with more sophisticated levels of autonomous driving +expected to be available and become more common in the future. Yet, technically +speaking, so-called "Level 5" (L5) operation, corresponding to full autonomy, +has not been achieved. For that to happen, functions such as fully autonomous +highway ramp entry must be available and provide provably safe, reliably +robust behavior to enable full autonomy. We present a systematic study of a +highway ramp function that controls the vehicle's forward-moving actions to +minimize collisions with the stream of highway traffic into which a merging +(ego) vehicle enters. We take a game-theoretic multi-agent (MA) approach to +this problem and study the use of controllers based on deep reinforcement +learning (DRL). The virtual environment of the MA DRL uses self-play with +simulated data where merging vehicles safely learn to control longitudinal +position during a taper-type merge. The work presented in this paper extends +existing work by studying the interaction of more than two vehicles (agents) +and does so by systematically expanding the road scene with additional traffic +and ego vehicles. While previous work on the two-vehicle setting established +that collision-free controllers are theoretically impossible in fully +decentralized, non-coordinated environments, we empirically show that +controllers learned using our approach are nearly ideal when measured against +idealized optimal controllers. +
+
+ comment: 9 pages, 9 figures; added support ack +
+
+
+
+
+ + ♻ ☆ Beyond Uncertainty: Risk-Aware Active View Acquisition for Safe Robot + Navigation and 3D Scene Understanding with FisherRF + + +
+ The active view acquisition problem has been extensively studied in the +context of robot navigation using NeRF and 3D Gaussian Splatting. To enhance +scene reconstruction efficiency and ensure robot safety, we propose the +Risk-aware Environment Masking (RaEM) framework. RaEM leverages coherent risk +measures to dynamically prioritize safety-critical regions of the unknown +environment, guiding active view acquisition algorithms toward identifying the +next-best-view (NBV). Integrated with FisherRF, which selects the NBV by +maximizing expected information gain, our framework achieves a dual objective: +improving robot safety and increasing efficiency in risk-aware 3D scene +reconstruction and understanding. Extensive high-fidelity experiments validate +the effectiveness of our approach, demonstrating its ability to establish a +robust and safety-focused framework for active robot exploration and 3D scene +understanding. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 10 + +
+
+
+ + ☆ FaceXBench: Evaluating Multimodal LLMs on Face Understanding + + +
+ Multimodal Large Language Models (MLLMs) demonstrate impressive +problem-solving abilities across a wide range of tasks and domains. However, +their capacity for face understanding has not been systematically studied. To +address this gap, we introduce FaceXBench, a comprehensive benchmark designed +to evaluate MLLMs on complex face understanding tasks. FaceXBench includes +5,000 multimodal multiple-choice questions derived from 25 public datasets and +a newly created dataset, FaceXAPI. These questions cover 14 tasks across 6 +broad categories, assessing MLLMs' face understanding abilities in bias and +fairness, face authentication, recognition, analysis, localization and tool +retrieval. Using FaceXBench, we conduct an extensive evaluation of 26 +open-source MLLMs alongside 2 proprietary models, revealing the unique +challenges in complex face understanding tasks. We analyze the models across +three evaluation settings: zero-shot, in-context task description, and +chain-of-thought prompting. Our detailed analysis reveals that current MLLMs, +including advanced models like GPT-4o, and GeminiPro 1.5, show significant room +for improvement. We believe FaceXBench will be a crucial resource for +developing MLLMs equipped to perform sophisticated face understanding. Code: +https://github.com/Kartik-3004/facexbench + +
+
+ comment: Project Page: https://kartik-3004.github.io/facexbench/ +
+
+
+
+
+ + ☆ Zero-Shot Monocular Scene Flow Estimation in the Wild + + +
+ Large models have shown generalization across datasets for many low-level +vision tasks, like depth estimation, but no such general models exist for scene +flow. Even though scene flow has wide potential use, it is not used in practice +because current predictive models do not generalize well. We identify three key +challenges and propose solutions for each. First, we create a method that +jointly estimates geometry and motion for accurate prediction. Second, we +alleviate scene flow data scarcity with a data recipe that affords us 1M +annotated training samples across diverse synthetic scenes. Third, we evaluate +different parameterizations for scene flow prediction and adopt a natural and +effective parameterization. Our resulting model outperforms existing methods as +well as baselines built on large-scale models in terms of 3D end-point error, +and shows zero-shot generalization to the casually captured videos from DAVIS +and the robotic manipulation scenes from RoboTAP. Overall, our approach makes +scene flow prediction more practical in the wild. +
+
+ comment: Project Website: https://research.nvidia.com/labs/zero_msf +
+
+
+
+
+ + ☆ 3rd Workshop on Maritime Computer Vision (MaCVi) 2025: Challenge Results + + +
+ The 3rd Workshop on Maritime Computer Vision (MaCVi) 2025 addresses maritime +computer vision for Unmanned Surface Vehicles (USV) and underwater. This report +offers a comprehensive overview of the findings from the challenges. We provide +both statistical and qualitative analyses, evaluating trends from over 700 +submissions. All datasets, evaluation code, and the leaderboard are available +to the public at https://macvi.org/workshop/macvi25. + +
+
+ comment: Part of the MaCVi 2025 workshop +
+
+
+
+
+ + ☆ DiffStereo: High-Frequency Aware Diffusion Model for Stereo Image + Restoration + + +
+ Diffusion models (DMs) have achieved promising performance in image +restoration but haven't been explored for stereo images. The application of DM +in stereo image restoration is confronted with a series of challenges. The need +to reconstruct two images exacerbates DM's computational cost. Additionally, +existing latent DMs usually focus on semantic information and remove +high-frequency details as redundancy during latent compression, which is +precisely what matters for image restoration. To address the above problems, we +propose a high-frequency aware diffusion model, DiffStereo for stereo image +restoration as the first attempt at DM in this domain. Specifically, DiffStereo +first learns latent high-frequency representations (LHFR) of HQ images. DM is +then trained in the learned space to estimate LHFR for stereo images, which are +fused into a transformer-based stereo image restoration network providing +beneficial high-frequency information of corresponding HQ images. The +resolution of LHFR is kept the same as input images, which preserves the +inherent texture from distortion. And the compression in channels alleviates +the computational burden of DM. Furthermore, we devise a position encoding +scheme when integrating the LHFR into the restoration network, enabling +distinctive guidance in different depths of the restoration network. +Comprehensive experiments verify that by combining generative DM and +transformer, DiffStereo achieves both higher reconstruction accuracy and better +perceptual quality on stereo super-resolution, deblurring, and low-light +enhancement compared with state-of-the-art methods. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ New Fashion Products Performance Forecasting: A Survey on Evolutions, + Models and Emerging Trends + + +
+ The fast fashion industry's insatiable demand for new styles and rapid +production cycles has led to a significant environmental burden. +Overproduction, excessive waste, and harmful chemicals have contributed to the +negative environmental impact of the industry. To mitigate these issues, a +paradigm shift that prioritizes sustainability and efficiency is urgently +needed. Integrating learning-based predictive analytics into the fashion +industry represents a significant opportunity to address environmental +challenges and drive sustainable practices. By forecasting fashion trends and +optimizing production, brands can reduce their ecological footprint while +remaining competitive in a rapidly changing market. However, one of the key +challenges in forecasting fashion sales is the dynamic nature of consumer +preferences. Fashion is acyclical, with trends constantly evolving and +resurfacing. In addition, cultural changes and unexpected events can disrupt +established patterns. This problem is also known as New Fashion Products +Performance Forecasting (NFPPF), and it has recently gained more and more +interest in the global research landscape. Given its multidisciplinary nature, +the field of NFPPF has been approached from many different angles. This +comprehensive survey wishes to provide an up-to-date overview that focuses on +learning-based NFPPF strategies. The survey is based on the Preferred Reporting +Items for Systematic Reviews and Meta-Analyses (PRISMA) methodological flow, +allowing for a systematic and complete literature review. In particular, we +propose the first taxonomy that covers the learning panorama for NFPPF, +examining in detail the different methodologies used to increase the amount of +multimodal information, as well as the state-of-the-art available datasets. +Finally, we discuss the challenges and future directions. + +
+
+ comment: Accepted at the Springer Nature Computer Science journal +
+
+
+
+
+ + ☆ HiMix: Reducing Computational Complexity in Large Vision-Language Models + + +
+ Benefiting from recent advancements in large language models and modality +alignment techniques, existing Large Vision-Language Models(LVLMs) have +achieved prominent performance across a wide range of scenarios. However, the +excessive computational complexity limits the widespread use of these models in +practical applications. We argue that one main bottleneck in computational +complexity is caused by the involvement of redundant vision sequences in model +computation. This is inspired by a reassessment of the efficiency of vision and +language information transmission in the language decoder of LVLMs. Then, we +propose a novel hierarchical vision-language interaction mechanism called +Hierarchical Vision injection for Mixture Attention (HiMix). In HiMix, only the +language sequence undergoes full forward propagation, while the vision sequence +interacts with the language at specific stages within each language decoder +layer. It is striking that our approach significantly reduces computational +complexity with minimal performance loss. Specifically, HiMix achieves a 10x +reduction in the computational cost of the language decoder across multiple +LVLM models while maintaining comparable performance. This highlights the +advantages of our method, and we hope our research brings new perspectives to +the field of vision-language understanding. Project Page: +https://xuange923.github.io/HiMix + +
+
+
+
+
+ + ☆ GSTAR: Gaussian Surface Tracking and Reconstruction + + +
+ 3D Gaussian Splatting techniques have enabled efficient photo-realistic +rendering of static scenes. Recent works have extended these approaches to +support surface reconstruction and tracking. However, tracking dynamic surfaces +with 3D Gaussians remains challenging due to complex topology changes, such as +surfaces appearing, disappearing, or splitting. To address these challenges, we +propose GSTAR, a novel method that achieves photo-realistic rendering, accurate +surface reconstruction, and reliable 3D tracking for general dynamic scenes +with changing topology. Given multi-view captures as input, GSTAR binds +Gaussians to mesh faces to represent dynamic objects. For surfaces with +consistent topology, GSTAR maintains the mesh topology and tracks the meshes +using Gaussians. In regions where topology changes, GSTAR adaptively unbinds +Gaussians from the mesh, enabling accurate registration and the generation of +new surfaces based on these optimized Gaussians. Additionally, we introduce a +surface-based scene flow method that provides robust initialization for +tracking between frames. Experiments demonstrate that our method effectively +tracks and reconstructs dynamic surfaces, enabling a range of applications. Our +project page with the code release is available at +https://chengwei-zheng.github.io/GSTAR/. + +
+
+
+
+
+ + ♻ ☆ MVTamperBench: Evaluating Robustness of Vision-Language Models + + +
+ Multimodal Large Language Models (MLLMs) have driven major advances in video +understanding, yet their vulnerability to adversarial tampering and +manipulations remains underexplored. To address this gap, we introduce +MVTamperBench, a benchmark that systematically evaluates MLLM robustness +against five prevalent tampering techniques: rotation, masking, substitution, +repetition, and dropping. Built from 3.4K original videos, expanded to over 17K +tampered clips spanning 19 video tasks, MVTamperBench challenges models to detect manipulations in spatial and +temporal coherence. We evaluate 45 recent MLLMs from 15+ model families, +revealing substantial variability in resilience across tampering types and +showing that larger parameter counts do not necessarily guarantee robustness. +MVTamperBench sets a new benchmark for developing tamper-resilient MLLMs for +safety-critical applications, including detecting clickbait, preventing harmful +content distribution, and enforcing policies on media platforms. We release all +code and data to foster open research in trustworthy video understanding. + Code: https://amitbcp.github.io/MVTamperBench/ Data: +https://huggingface.co/datasets/Srikant86/MVTamperBench +
+
+
+
+
+ + ♻ ☆ Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid + Prototyping in Virtual Reality Applications + + +
+ SLAM is a foundational technique with broad applications in robotics and +AR/VR. SLAM simulations evaluate new concepts, but testing on +resource-constrained devices, such as VR HMDs, faces challenges: high +computational cost and restricted sensor data access. This work proposes a +sparse framework using mesh geometry projections as features, which improves +efficiency and circumvents direct sensor data access, advancing SLAM research +as we demonstrate in VR and through numerical evaluation. + +
+
+
+
+
+ + ♻ ☆ ESVO2: Direct Visual-Inertial Odometry with Stereo Event Cameras + + +
+ Event-based visual odometry is a specific branch of visual Simultaneous +Localization and Mapping (SLAM) techniques, which aims at solving tracking and +mapping subproblems (typically in parallel), by exploiting the special working +principles of neuromorphic (i.e., event-based) cameras. Due to the +motion-dependent nature of event data, explicit data association (i.e., feature +matching) under large-baseline view-point changes is difficult to establish, +making direct methods a more rational choice. However, state-of-the-art direct +methods are limited by the high computational complexity of the mapping +sub-problem and the degeneracy of camera pose tracking in certain degrees of +freedom (DoF) in rotation. In this paper, we tackle these issues by building an +event-based stereo visual-inertial odometry system on top of a direct pipeline. +Specifically, to speed up the mapping operation, we propose an efficient +strategy for sampling contour points according to the local dynamics of events. +The mapping performance is also improved in terms of structure completeness and +local smoothness by merging the temporal stereo and static stereo results. To +circumvent the degeneracy of camera pose tracking in recovering the pitch and +yaw components of general 6-DoF motion, we introduce IMU measurements as motion +priors via pre-integration. To this end, a compact back-end is proposed for +continuously updating the IMU bias and predicting the linear velocity, enabling +an accurate motion prediction for camera pose tracking. The resulting system +scales well with modern high-resolution event cameras and leads to better +global positioning accuracy in large-scale outdoor environments. Extensive +evaluations on five publicly available datasets featuring different resolutions +and scenarios justify the superior performance of the proposed system against +five state-of-the-art methods. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 30 + +
+
+
+ + ☆ Distilling Multi-modal Large Language Models for Autonomous Driving + + +
+ Autonomous driving demands safe motion planning, especially in critical +"long-tail" scenarios. Recent end-to-end autonomous driving systems leverage +large language models (LLMs) as planners to improve generalizability to rare +events. However, using LLMs at test time introduces high computational costs. +To address this, we propose DiMA, an end-to-end autonomous driving system that +maintains the efficiency of an LLM-free (or vision-based) planner while +leveraging the world knowledge of an LLM. DiMA distills the information from a +multi-modal LLM to a vision-based end-to-end planner through a set of specially +designed surrogate tasks. Under a joint training strategy, a scene encoder +common to both networks produces structured representations that are +semantically grounded as well as aligned to the final planning objective. +Notably, the LLM is optional at inference, enabling robust planning without +compromising on efficiency. Training with DiMA results in a 37% reduction in +the L2 trajectory error and an 80% reduction in the collision rate of the +vision-based planner, as well as a 44% trajectory error reduction in longtail +scenarios. DiMA also achieves state-of-the-art performance on the nuScenes +planning benchmark. + +
+
+
+
+
+ + ☆ FAST: Efficient Action Tokenization for Vision-Language-Action Models + + +
+ Autoregressive sequence models, such as Transformer-based vision-language +action (VLA) policies, can be tremendously effective for capturing complex and +generalizable robotic behaviors. However, such models require us to choose a +tokenization of our continuous action signals, which determines how the +discrete symbols predicted by the model map to continuous robot actions. We +find that current approaches for robot action tokenization, based on simple +per-dimension, per-timestep binning schemes, typically perform poorly when +learning dexterous skills from high-frequency robot data. To address this +challenge, we propose a new compression-based tokenization scheme for robot +actions, based on the discrete cosine transform. Our tokenization approach, +Frequency-space Action Sequence Tokenization (FAST), enables us to train +autoregressive VLAs for highly dexterous and high-frequency tasks where +standard discretization methods fail completely. Based on FAST, we release +FAST+, a universal robot action tokenizer, trained on 1M real robot action +trajectories. It can be used as a black-box tokenizer for a wide range of robot +action sequences, with diverse action spaces and control frequencies. Finally, +we show that, when combined with the pi0 VLA, our method can scale to training +on 10k hours of robot data and match the performance of diffusion VLAs, while +reducing training time by up to 5x. + +
+
+ comment: Website: https://www.pi.website/research/fast +
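+ The compression idea behind a DCT-based action tokenizer can be sketched as follows: transform a chunk of continuous actions along the time axis, quantize the coefficients, and emit the quantized values as discrete tokens. This is only a minimal sketch under assumed scale and chunk parameters; the released tokenizer is additionally trained on 1M real robot trajectories (per the abstract) and further compresses the quantized coefficient stream, which this sketch omits.
```python
import numpy as np
from scipy.fft import dct, idct

def tokenize(chunk, scale=10.0):
    """chunk: (T, D) array of continuous actions. DCT along time, then
    round the scaled coefficients to integers, flatten into a token list."""
    coeffs = dct(chunk, axis=0, norm="ortho")
    return np.round(coeffs * scale).astype(int).ravel().tolist()

def detokenize(tokens, T, D, scale=10.0):
    """Inverse: reshape, undo scaling, inverse DCT back to actions."""
    coeffs = np.array(tokens, dtype=float).reshape(T, D) / scale
    return idct(coeffs, axis=0, norm="ortho")

rng = np.random.default_rng(0)
actions = np.cumsum(rng.normal(0, 0.05, size=(50, 7)), axis=0)  # smooth 50x7 chunk
toks = tokenize(actions)
recon = detokenize(toks, *actions.shape)
print(len(toks), np.abs(actions - recon).max())   # token count and reconstruction error
```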
+
+
+
+
+ + ☆ FLOL: Fast Baselines for Real-World Low-Light Enhancement + + +
+ Low-Light Image Enhancement (LLIE) is a key task in computational photography +and imaging. The problem of enhancing images captured during night or in dark +environments has been well-studied in the image signal processing literature. +However, current deep learning-based solutions struggle with efficiency and +robustness in real-world scenarios (e.g. scenes with noise, saturated pixels, +bad illumination). We propose a lightweight neural network that combines image +processing in the frequency and spatial domains. Our method, FLOL+, is one of +the fastest models for this task, achieving state-of-the-art results on popular +real scenes datasets such as LOL and LSRW. Moreover, we are able to process +1080p images under 12ms. Code and models at https://github.com/cidautai/FLOL + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ CoNav Chair: Design of a ROS-based Smart Wheelchair for Shared Control + Navigation in the Built Environment + + +
+ With the number of people with disabilities (PWD) increasing worldwide each +year, the demand for mobility support to enable independent living and social +integration is also growing. Wheelchairs commonly support the mobility of PWD +in both indoor and outdoor environments. However, current powered wheelchairs +(PWC) often fail to meet the needs of PWD, who may find it difficult to operate +them. Furthermore, existing research on robotic wheelchairs typically focuses +either on full autonomy or enhanced manual control, which can lead to reduced +efficiency and user trust. To address these issues, this paper proposes a Robot +Operating System (ROS)-based smart wheelchair, called CoNav Chair, that +incorporates a shared control navigation algorithm and obstacle avoidance to +support PWD while fostering efficiency and trust between the robot and the +user. Our design consists of hardware and software components. Experimental +results conducted in a typical indoor social environment demonstrate the +performance and effectiveness of the smart wheelchair hardware and software +design. This integrated design promotes trust and autonomy, which are crucial +for the acceptance of assistive mobility technologies in the built environment. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Model Predictive Path Integral Docking of Fully Actuated Surface Vessel + + +
+ Autonomous docking remains one of the most challenging maneuvers in marine +robotics, requiring precise control and robust perception in confined spaces. +This paper presents a novel approach integrating Model Predictive Path +Integral (MPPI) control with real-time LiDAR-based dock detection for autonomous +surface vessel docking. Our framework uniquely combines probabilistic +trajectory optimization with a multi-objective cost function that simultaneously +considers docking precision, safety constraints, and motion efficiency. The +MPPI controller generates optimal trajectories by intelligently sampling +control sequences and evaluating their costs based on dynamic clearance +requirements, orientation alignment, and target position objectives. We +introduce an adaptive dock detection pipeline that processes LiDAR point clouds +to extract critical geometric features, enabling real-time updates of docking +parameters. The proposed method is extensively validated in a physics-based +simulation environment that incorporates realistic sensor noise, vessel +dynamics, and environmental constraints. Results demonstrate successful docking +from various initial positions while maintaining safe clearances and smooth +motion characteristics. +
+
+ comment: 6 pages, 6 figures, 1 table, UT2025 Conference, IEEE International + Symposium on Underwater Technology 2025 +
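+ The MPPI update itself is generic and compact: sample perturbed control sequences, roll them out through a dynamics model, score them with the cost function, and average them with softmax weights. The sketch below uses a 2D point-mass surrogate and a quadratic docking cost; the paper's vessel dynamics, LiDAR-derived dock geometry, and clearance terms are not reproduced, and all parameters are assumptions.
```python
import numpy as np

def mppi_step(x0, u_nom, dynamics, cost, n_samples=256, sigma=0.5, lam=1.0, rng=None):
    """One MPPI update: returns an improved control sequence of shape (H, m)."""
    rng = rng or np.random.default_rng()
    H, m = u_nom.shape
    noise = rng.normal(0.0, sigma, size=(n_samples, H, m))
    costs = np.empty(n_samples)
    for k in range(n_samples):                 # roll out each sampled sequence
        x, c = x0.copy(), 0.0
        for t in range(H):
            x = dynamics(x, u_nom[t] + noise[k, t])
            c += cost(x)
        costs[k] = c
    w = np.exp(-(costs - costs.min()) / lam)   # importance weights
    w /= w.sum()
    return u_nom + np.tensordot(w, noise, axes=1)

# Point-mass surrogate: state [x, y, vx, vy], control = acceleration.
dt = 0.1
dynamics = lambda x, u: np.array([x[0] + dt*x[2], x[1] + dt*x[3],
                                  x[2] + dt*u[0], x[3] + dt*u[1]])
dock = np.array([5.0, 0.0])
cost = lambda x: np.sum((x[:2] - dock)**2) + 0.1*np.sum(x[2:]**2)

u = np.zeros((20, 2))
for _ in range(30):                            # receding-horizon refinement
    u = mppi_step(np.zeros(4), u, dynamics, cost)
print(u[0])                                    # first action to execute
```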
+
+
+
+
+ + ☆ Monte Carlo Tree Search with Velocity Obstacles for safe and efficient + motion planning in dynamic environments + + +
+ Online motion planning is a challenging problem for intelligent robots moving +in dense environments with dynamic obstacles, e.g., crowds. In this work, we +propose a novel approach for optimal and safe online motion planning with +minimal information about dynamic obstacles. Specifically, our approach +requires only the current position of the obstacles and their maximum speed, +but it does not need any information about their exact trajectories or dynamic +model. The proposed methodology combines Monte Carlo Tree Search (MCTS), for +online optimal planning via model simulations, with Velocity Obstacles (VO), +for obstacle avoidance. We perform experiments in a cluttered simulated +environment with walls, and up to 40 dynamic obstacles moving with random +velocities and directions. With an ablation study, we show the key contribution +of VO in scaling up the efficiency of MCTS, selecting the safest and most +rewarding actions in the tree of simulations. Moreover, we show the superiority +of our methodology with respect to state-of-the-art planners, including +Non-linear Model Predictive Control (NMPC), in terms of improved collision +rate, computational and task performance. + +
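+ Because only the obstacles' current positions and maximum speeds are assumed known, the velocity-obstacle test reduces to a conservative reachability check: a candidate robot velocity is rejected if, at any time within the horizon, the robot could come within the combined radius of any position the obstacle can reach. A minimal sketch follows; the horizon, radii, and names are assumptions, not the paper's exact formulation.
```python
import numpy as np

def velocity_admissible(p_robot, v_cand, p_obs, v_obs_max, r_sum,
                        horizon=3.0, dt=0.1):
    """Conservative velocity-obstacle test using only the obstacle's current
    position and maximum speed: reject v_cand if at some time t <= horizon the
    robot could be within r_sum of any position the obstacle can reach."""
    for t in np.arange(dt, horizon + 1e-9, dt):
        gap = np.linalg.norm(p_robot + v_cand * t - p_obs)
        if gap <= r_sum + v_obs_max * t:     # obstacle reachable disc grows with t
            return False
    return True

p_r, p_o = np.array([0.0, 0.0]), np.array([2.0, 0.0])
for v in [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([-1.0, 0.0])]:
    print(v, velocity_admissible(p_r, v, p_o, v_obs_max=0.3, r_sum=0.6))
```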
+
+
+
+
+ + ☆ Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid + Prototyping in Virtual Reality Applications + + +
+ SLAM is a foundational technique with broad applications in robotics and +AR/VR. SLAM simulations evaluate new concepts, but testing on +resource-constrained devices, such as VR HMDs, faces challenges: high +computational cost and restricted sensor data access. This work proposes a +sparse framework using mesh geometry projections as features, which improves +efficiency and circumvents direct sensor data access, advancing SLAM research +as we demonstrate in VR and through numerical evaluation. + +
+
+
+
+
+ + ☆ Comparison of Various SLAM Systems for Mobile Robot in an Indoor + Environment + + +
+ This article presents a comparative analysis of mobile robot trajectories +computed by various ROS-based SLAM systems. To this end, we developed a +prototype of a mobile robot with common sensors: a 2D lidar, a monocular camera, and a ZED +stereo camera. Then we conducted experiments in a typical office environment +and collected data from all sensors, and ran all tested SLAM systems on +the acquired dataset. We studied the following SLAM systems: (a) 2D +lidar-based: GMapping, Hector SLAM, Cartographer; (b) monocular camera-based: +Large Scale Direct monocular SLAM (LSD SLAM), ORB SLAM, Direct Sparse Odometry +(DSO); and (c) stereo camera-based: ZEDfu, Real-Time Appearance-Based Mapping +(RTAB map), ORB SLAM, Stereo Parallel Tracking and Mapping (S-PTAM). Since all +SLAM methods were tested on the same dataset, we compared results for different +SLAM systems with appropriate metrics, demonstrating encouraging results for +the lidar-based Cartographer, monocular ORB SLAM, and stereo RTAB-Map methods. +
+
+ comment: 6 pages, 6 figures +
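+ Trajectory comparisons of this kind are commonly scored with the absolute trajectory error (ATE): rigidly align the estimated positions to ground truth and take the RMSE of the residuals. The sketch below is a generic Horn/Umeyama-style alignment on synthetic data, not the metric pipeline used in the article; all names and test values are assumptions.
```python
import numpy as np

def ate_rmse(gt, est):
    """Absolute trajectory error: rigidly align the estimated positions to the
    ground truth (rotation + translation only) and return the RMSE of the
    residuals. gt, est: (N, 3) arrays of time-associated positions."""
    mu_g, mu_e = gt.mean(axis=0), est.mean(axis=0)
    G, E = gt - mu_g, est - mu_e
    U, _, Vt = np.linalg.svd(E.T @ G)          # cross-covariance of centered sets
    S = np.diag([1.0, 1.0, np.sign(np.linalg.det(U @ Vt))])
    R = (U @ S @ Vt).T                         # rotation taking est -> gt
    aligned = (R @ E.T).T + mu_g
    return float(np.sqrt(np.mean(np.sum((aligned - gt) ** 2, axis=1))))

rng = np.random.default_rng(1)
gt = np.cumsum(rng.normal(0, 0.1, size=(200, 3)), axis=0)   # ground-truth path
theta = 0.3                                                 # yaw offset of the estimate
Rz = np.array([[np.cos(theta), -np.sin(theta), 0.0],
               [np.sin(theta),  np.cos(theta), 0.0],
               [0.0, 0.0, 1.0]])
est = gt @ Rz.T + rng.normal(0, 0.02, size=gt.shape) + np.array([0.5, -0.2, 0.1])
print(f"ATE RMSE: {ate_rmse(gt, est):.3f} m")
```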
+
+
+
+
+ + ☆ Sensorimotor Control Strategies for Tactile Robotics + + +
+ How are robots becoming smarter at interacting with their surroundings? +Recent advances have reshaped how robots use tactile sensing to perceive and +engage with the world. Tactile sensing is a game-changer, allowing robots to +embed sensorimotor control strategies to interact with complex environments and +skillfully handle heterogeneous objects. Such control frameworks plan +contact-driven motions while staying responsive to sudden changes. We review +the latest methods for building perception and control systems in tactile +robotics while offering practical guidelines for their design and +implementation. We also address key challenges to shape the future of +intelligent robots. + +
+
+ comment: 39 pages, 8 figures, 1 table +
+
+
+
+
+ + ☆ Real-Time Generation of Near-Minimum-Energy Trajectories via + Constraint-Informed Residual Learning + + +
+ Industrial robotics demands significant energy to operate, making +energy-reduction methodologies increasingly important. Strategies for planning +minimum-energy trajectories typically involve solving nonlinear optimal control +problems (OCPs), which rarely cope with real-time requirements. In this paper, +we propose a paradigm for generating near minimum-energy trajectories for +manipulators by learning from optimal solutions. Our paradigm leverages a +residual learning approach, which embeds boundary conditions while focusing on +learning only the adjustments needed to steer a standard solution to an optimal +one. Compared to a computationally expensive OCP-based planner, our paradigm +achieves 87.3% of the performance near the training dataset and 50.8% far from +the dataset, while being two to three orders of magnitude faster. + +
+
+
+
+
+ + ☆ Path Planning for a UAV Swarm Using Formation Teaching-Learning-Based + Optimization + + +
+ This work addresses the path planning problem for a group of unmanned aerial +vehicles (UAVs) to maintain a desired formation during operation. Our approach +formulates the problem as an optimization task by defining a set of fitness +functions that not only ensure the formation but also include constraints for +optimal and safe UAV operation. To optimize the fitness function and obtain a +suboptimal path, we employ the teaching-learning-based optimization algorithm +and then further enhance it with mechanisms such as mutation, elite strategy, +and multi-subject combination. A number of simulations and experiments have +been conducted to evaluate the proposed method. The results demonstrate that +the algorithm successfully generates valid paths for the UAVs to fly in a +triangular formation for an inspection task. + +
+
+ comment: in Proceedings of the 2025 International Conference on Energy, + Infrastructure and Environmental Research (EIER2025) +
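+ Plain teaching-learning-based optimization alternates a teacher phase (the population moves toward the current best solution) and a learner phase (pairwise peer interactions). The sketch below minimizes a toy objective; the paper's formation and safety fitness terms, as well as its mutation, elite-strategy, and multi-subject enhancements, are not included, and all parameters are assumptions.
```python
import numpy as np

def tlbo(objective, bounds, pop_size=30, iters=100, rng=None):
    """Basic TLBO minimizing `objective` over a box given as (lower, upper) arrays."""
    rng = rng or np.random.default_rng()
    lo, hi = bounds
    X = rng.uniform(lo, hi, size=(pop_size, len(lo)))
    f = np.apply_along_axis(objective, 1, X)
    for _ in range(iters):
        # Teacher phase: everyone moves toward the best solution.
        teacher = X[f.argmin()]
        TF = rng.integers(1, 3)                       # teaching factor in {1, 2}
        X_new = np.clip(X + rng.random(X.shape) * (teacher - TF * X.mean(axis=0)), lo, hi)
        f_new = np.apply_along_axis(objective, 1, X_new)
        better = f_new < f
        X[better], f[better] = X_new[better], f_new[better]
        # Learner phase: each learner interacts with a random peer.
        for i in range(pop_size):
            j = rng.integers(pop_size)
            if j == i:
                continue
            step = (X[j] - X[i]) if f[j] < f[i] else (X[i] - X[j])
            cand = np.clip(X[i] + rng.random(len(lo)) * step, lo, hi)
            fc = objective(cand)
            if fc < f[i]:
                X[i], f[i] = cand, fc
    return X[f.argmin()], f.min()

sphere = lambda x: float(np.sum(x ** 2))
best, val = tlbo(sphere, (np.full(5, -10.0), np.full(5, 10.0)))
print(best, val)
```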
+
+
+
+
+ + ☆ Robust UAV Path Planning with Obstacle Avoidance for Emergency Rescue + + +
+ Unmanned aerial vehicles (UAVs) are efficient tools for diverse tasks +such as electronic reconnaissance, agricultural operations, and disaster relief. +In complex three-dimensional (3D) environments, path planning with +obstacle avoidance for UAVs is a significant issue for safety assurance. In +this paper, we construct a comprehensive 3D scenario with obstacles and no-fly +zones for dynamic UAV trajectory planning. Moreover, a novel artificial potential field +algorithm coupled with simulated annealing (APF-SA) is proposed to tackle the +robust path planning problem. APF-SA modifies the attractive and repulsive +potential functions and leverages simulated annealing to escape local minima +and converge to globally optimal solutions. Simulation results demonstrate +the effectiveness of APF-SA, enabling efficient autonomous path planning for +UAVs with obstacle avoidance. +
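+ The artificial potential field part of such methods can be illustrated with the classic attractive/repulsive formulation, descending the summed potential by gradient steps; the simulated annealing stage for escaping local minima and the paper's modified potentials are not reproduced, and the gains and toy scenario below are assumptions.
```python
import numpy as np

def apf_step(p, goal, obstacles, k_att=1.0, k_rep=100.0, rho0=5.0, step=0.05):
    """One gradient-descent step on the classic potentials.
    Attractive: 0.5*k_att*||p - goal||^2.  Repulsive (per obstacle, active
    only within influence radius rho0): 0.5*k_rep*(1/rho - 1/rho0)^2."""
    grad = k_att * (p - goal)                       # gradient of the attraction
    for obs in obstacles:
        diff = p - obs
        rho = np.linalg.norm(diff)
        if 1e-9 < rho < rho0:                       # obstacle within influence range
            grad += k_rep * (1.0/rho0 - 1.0/rho) / rho**3 * diff
    return p - step * grad                          # descend the total potential

p, goal = np.array([0.0, 0.0, 10.0]), np.array([60.0, 40.0, 20.0])
obstacles = [np.array([30.0, 25.0, 15.0])]
for _ in range(2000):
    p = apf_step(p, goal, obstacles)
print(np.round(p, 2))                               # converges near the goal
```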
+
+
+
+
+ + RoboReflect: Robotic Reflective Reasoning for Grasping + Ambiguous-Condition Objects + + +
+ As robotic technology rapidly develops, robots are being employed in an +increasing number of fields. However, due to the complexity of deployment +environments or the prevalence of ambiguous-condition objects, the practical +application of robotics still faces many challenges, leading to frequent +errors. Traditional methods and some LLM-based approaches, although improved, +still require substantial human intervention and struggle with autonomous error +correction in complex scenarios. In this work, we propose RoboReflect, a novel +framework leveraging large vision-language models (LVLMs) to enable +self-reflection and autonomous error correction in robotic grasping tasks. +RoboReflect allows robots to automatically adjust their strategies based on +unsuccessful attempts until successful execution is achieved. The corrected +strategies are saved in a memory for future task reference. We evaluate +RoboReflect through extensive testing on eight common objects from three +categories that are prone to ambiguous conditions. Our results demonstrate that +RoboReflect not only outperforms existing grasp pose estimation methods like +AnyGrasp and high-level action planning techniques using GPT-4V but also +significantly enhances the robot's ability to adapt and correct errors +independently. These findings underscore the critical importance of autonomous +self-reflection in robotic systems while effectively addressing the challenges +posed by ambiguous environments. +
+
+
+
+
+ + ☆ Interoceptive Robots for Convergent Shared Control in Collaborative + Construction Work + + +
+ Building autonomous mobile robots (AMRs) with optimized efficiency and +adaptive capabilities-able to respond to changing task demands and dynamic +environments-is a strongly desired goal for advancing construction robotics. +Such robots can play a critical role in enabling automation, reducing +operational carbon footprints, and supporting modular construction processes. +Inspired by the adaptive autonomy of living organisms, we introduce +interoception, which centers on the robot's internal state representation, as a +foundation for developing self-reflection and conscious learning to enable +continual learning and adaptability in robotic agents. In this paper, we +factorize internal state variables and mathematical properties as "cognitive +dissonance" in shared control paradigms, where human interventions occasionally +occur. We offer a new perspective on how interoception can help build adaptive +motion planning in AMRs by integrating the legacy of heuristic costs from +grid/graph-based algorithms with recent advances in neuroscience and +reinforcement learning. Declarative and procedural knowledge extracted from +human semantic inputs is encoded into a hypergraph model that overlaps with the +spatial configuration of onsite layout for path planning. In addition, we +design a velocity-replay module using an encoder-decoder architecture with +few-shot learning to enable robots to replicate velocity profiles in +contextualized scenarios for multi-robot synchronization and handover +collaboration. These "cached" knowledge representations are demonstrated in +simulated environments for multi-robot motion planning and stacking tasks. The +insights from this study pave the way toward artificial general intelligence in +AMRs, fostering their progression from complexity to competence in construction +automation. + +
+
+
+
+
+ + ☆ ThinTact:Thin Vision-Based Tactile Sensor by Lensless Imaging + + +
+ Vision-based tactile sensors have drawn increasing interest in the robotics +community. However, traditional lens-based designs impose minimum thickness +constraints on these sensors, limiting their applicability in space-restricted +settings. In this paper, we propose ThinTact, a novel lensless vision-based +tactile sensor with a sensing field of over 200 mm^2 and a thickness of less +than 10 mm. ThinTact utilizes the mask-based lensless imaging technique to map +the contact information to CMOS signals. To ensure real-time tactile sensing, +we propose a real-time lensless reconstruction algorithm that leverages a +frequency-spatial-domain joint filter based on the discrete cosine transform (DCT). +This algorithm achieves significantly faster computation than existing +optimization-based methods. Additionally, to improve the sensing quality, we +develop a mask optimization method based on a genetic algorithm and the +corresponding system matrix calibration algorithm. We evaluate the performance +of our proposed lensless reconstruction and tactile sensing through qualitative +and quantitative experiments. Furthermore, we demonstrate ThinTact's practical +applicability in diverse applications, including texture recognition and +contact-rich object manipulation. The paper will appear in the IEEE +Transactions on Robotics: https://ieeexplore.ieee.org/document/10842357. Video: +https://youtu.be/YrOO9BDMAHo +
+
+ comment: © 2025 IEEE. Personal use of this material is permitted.
+ Permission from IEEE must be obtained for all other uses, in any current or
+ future media, including reprinting/republishing this material for advertising
+ or promotional purposes, creating new collective works, for resale or
+ redistribution to servers or lists, or reuse of any copyrighted component of
+ this work in other works
+
+
+
+
+ + ☆ Are Open-Vocabulary Models Ready for Detection of MEP Elements on + Construction Sites + + +
+ The construction industry has long explored robotics and computer vision, yet +their deployment on construction sites remains very limited. These technologies +have the potential to revolutionize traditional workflows by enhancing +accuracy, efficiency, and safety in construction management. Ground robots +equipped with advanced vision systems could automate tasks such as monitoring +mechanical, electrical, and plumbing (MEP) systems. The present research +evaluates the applicability of open-vocabulary vision-language models compared +to fine-tuned, lightweight, closed-set object detectors for detecting MEP +components using a mobile ground robotic platform. A dataset collected with +cameras mounted on a ground robot was manually annotated and analyzed to +compare model performance. The results demonstrate that, despite the +versatility of vision-language models, fine-tuned lightweight models still +largely outperform them in specialized environments and for domain-specific +tasks. + +
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ☆ Torque Responsive Metamaterials Enable High Payload Soft Robot Arms + + +
+ Soft robots have struggled to support large forces and moments while also +supporting their own weight against gravity. This limits their ability to reach +certain configurations necessary for tasks such as inspection and pushing +objects up. We have overcome this limitation by creating an electrically driven +metamaterial soft arm using handed shearing auxetics (HSA) and bendable +extendable torque resistant (BETR) shafts. These use the large force and torque +capacity of HSAs and the nestable torque transmission of BETRs to create a +strong soft arm. We found that the HSA arm was able to push 2.3 kg vertically +and lift more than 600 g when positioned horizontally, supporting 0.33 Nm of +torque at the base. The arm is able to move between waypoints while carrying +the large payload and demonstrates consistent movement with path variance below +5 mm. The HSA arm's ability to perform active grasping with HSA grippers was +also demonstrated, requiring 20 N of pull force to dislodge the object. +Finally, we test the arm in a pipe inspection task. The arm is able to locate +all the defects while sliding against the inner surface of the pipe, +demonstrating its compliance. + +
+
+ comment: 9 pages, 8 figures, currently under review +
+
+
+
+
+ + ☆ GeoManip: Geometric Constraints as General Interfaces for Robot + Manipulation + + +
+ We present GeoManip, a framework that enables generalist robots to leverage
+essential conditions derived from object and part relationships, expressed as
+geometric constraints, for robot manipulation. For example, cutting a carrot
+requires adhering to a geometric constraint: the blade of the knife should be
+perpendicular to the carrot's direction. By interpreting these constraints
+through symbolic language representations and translating them into low-level
+actions, GeoManip bridges the gap between natural language and robotic
+execution, enabling greater generalizability across diverse, even unseen,
+tasks, objects, and scenarios. Unlike vision-language-action models that
+require extensive training, GeoManip operates training-free by utilizing large
+foundation models: a constraint generation module predicts stage-specific
+geometric constraints, and a geometry parser identifies the object parts
+involved in these constraints. A solver then optimizes trajectories to satisfy
+the constraints inferred from the task description and the scene. Furthermore,
+GeoManip learns in-context and provides five appealing human-robot interaction
+features: on-the-fly policy adaptation, learning from human demonstrations,
+learning from failure cases, long-horizon action planning, and efficient data
+collection for imitation learning. Extensive evaluations in both simulated and
+real-world scenarios demonstrate GeoManip's state-of-the-art performance, with
+superior out-of-distribution generalization while avoiding costly model
+training.
+
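+ A toy example of how such a geometric constraint can be written as a cost for a
+trajectory optimizer (a minimal formulation of my own, not GeoManip's actual solver or API):
+
+# "Blade perpendicular to carrot axis" expressed as a differentiable cost that a
+# trajectory optimizer could minimize (illustrative only).
+import numpy as np
+
+def perpendicularity_cost(blade_dirs, carrot_axis):
+    """Sum of squared cosines between per-step blade directions (T x 3) and the carrot
+    axis; the cost is zero exactly when every blade direction is perpendicular to it."""
+    a = carrot_axis / np.linalg.norm(carrot_axis)
+    d = blade_dirs / np.linalg.norm(blade_dirs, axis=1, keepdims=True)
+    return float(np.sum((d @ a) ** 2))
+
+carrot_axis = np.array([1.0, 0.0, 0.0])
+bad_traj = np.tile([1.0, 0.2, 0.0], (10, 1))   # blade nearly parallel to the carrot
+good_traj = np.tile([0.0, 1.0, 0.0], (10, 1))  # blade perpendicular to the carrot
+print(perpendicularity_cost(bad_traj, carrot_axis))   # large -> constraint violated
+print(perpendicularity_cost(good_traj, carrot_axis))  # 0.0 -> constraint satisfied
+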
+
+ comment: 32 pages, 13 figures +
+
+
+
+
+ + ☆ SMPLest-X: Ultimate Scaling for Expressive Human Pose and Shape + Estimation + + +
+ Expressive human pose and shape estimation (EHPS) unifies body, hands, and +face motion capture with numerous applications. Despite encouraging progress, +current state-of-the-art methods focus on training innovative architectural +designs on confined datasets. In this work, we investigate the impact of +scaling up EHPS towards a family of generalist foundation models. 1) For data +scaling, we perform a systematic investigation on 40 EHPS datasets, +encompassing a wide range of scenarios that a model trained on any single +dataset cannot handle. More importantly, capitalizing on insights obtained from +the extensive benchmarking process, we optimize our training scheme and select +datasets that lead to a significant leap in EHPS capabilities. Ultimately, we +achieve diminishing returns at 10M training instances from diverse data +sources. 2) For model scaling, we take advantage of vision transformers (up to +ViT-Huge as the backbone) to study the scaling law of model sizes in EHPS. To +exclude the influence of algorithmic design, we base our experiments on two +minimalist architectures: SMPLer-X, which consists of an intermediate step for +hand and face localization, and SMPLest-X, an even simpler version that reduces +the network to its bare essentials and highlights significant advances in the +capture of articulated hands. With big data and the large model, the foundation +models exhibit strong performance across diverse test benchmarks and excellent +transferability to even unseen environments. Moreover, our finetuning strategy +turns the generalist into specialist models, allowing them to achieve further +performance boosts. Notably, our foundation models consistently deliver +state-of-the-art results on seven benchmarks such as AGORA, UBody, EgoBody, and +our proposed SynHand dataset for comprehensive hand evaluation. (Code is +available at: https://github.com/wqyin/SMPLest-X). + +
+
+ comment: An extension of SMPLer-X [arXiv:2309.17448]. Homepage: + https://caizhongang.com/projects/SMPLer-X/ +
+
+
+
+
+ + ♻ ☆ Global SLAM in Visual-Inertial Systems with 5G Time-of-Arrival + Integration + + +
+ This paper presents a novel approach that integrates 5G Time of Arrival (ToA) +measurements into ORB-SLAM3 to enable global localization and enhance mapping +capabilities for indoor drone navigation. We extend ORB-SLAM3's optimization +pipeline to jointly process ToA data from 5G base stations alongside visual and +inertial measurements while estimating system biases. This integration +transforms the inherently local SLAM estimates into globally referenced +trajectories and effectively resolves scale ambiguity in monocular +configurations. Our method is evaluated using five real-world indoor datasets +collected with RGB-D cameras and inertial measurement units (IMUs), +complemented by simulated 5G ToA measurements at 28 GHz and 78 GHz frequencies +using MATLAB and QuaDRiGa. Extensive experiments across four SLAM +configurations (RGB-D, RGB-D-Inertial, Monocular, and Monocular-Inertial) +demonstrate that ToA integration enables consistent global positioning across +all modes while significantly improving local accuracy in minimal sensor +setups. Notably, ToA-enhanced monocular SLAM achieves superior local accuracy +(6.3 cm average) compared to the RGB-D baseline (11.5 cm), and enables reliable +operation of monocular-inertial SLAM in scenarios where the baseline system +fails completely. While ToA integration offers limited local accuracy +improvements for sensor-rich configurations like RGB-D SLAM, it consistently +enables robust global localization. + +
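+ The exact factor formulation lives in the paper; below is only a generic ToA residual
+of the kind such a pipeline adds to the optimization, with an assumed receiver clock
+bias as an extra state:
+
+# Generic 5G ToA residual sketch (not the ORB-SLAM3 integration itself).
+import numpy as np
+
+C = 299_792_458.0  # speed of light [m/s]
+
+def toa_residual(p_world, p_base_station, toa_measured, clock_bias):
+    """Residual in meters: bias-corrected measured range minus predicted geometric range."""
+    predicted_range = np.linalg.norm(p_world - p_base_station)
+    measured_range = C * toa_measured - clock_bias
+    return measured_range - predicted_range
+
+p = np.array([2.0, 1.0, 1.5])        # current drone position estimate
+bs = np.array([10.0, -3.0, 4.0])     # known 5G base-station position
+true_range = np.linalg.norm(p - bs)
+toa = (true_range + 0.7) / C         # simulated measurement with a 0.7 m bias
+print(round(toa_residual(p, bs, toa, clock_bias=0.7), 6))  # ~0 once the bias is estimated
+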
+
+
+
+
+ + ♻ ☆ AeroHaptix: A Wearable Vibrotactile Feedback System for Enhancing + Collision Avoidance in UAV Teleoperation + + +
+ Haptic feedback enhances collision avoidance by providing directional +obstacle information to operators during unmanned aerial vehicle (UAV) +teleoperation. However, such feedback is often rendered via haptic joysticks, +which are unfamiliar to UAV operators and limited to single-direction force +feedback. Additionally, the direct coupling between the input device and the +feedback method diminishes operators' sense of control and induces oscillatory +movements. To overcome these limitations, we propose AeroHaptix, a wearable +haptic feedback system that uses spatial vibrations to simultaneously +communicate multiple obstacle directions to operators, without interfering with +their input control. The layout of vibrotactile actuators was optimized via a +perceptual study to eliminate perceptual biases and achieve uniform spatial +coverage. A novel rendering algorithm, MultiCBF, extended control barrier +functions to support multi-directional feedback. Our system evaluation showed +that compared to a no-feedback condition, AeroHaptix effectively reduced the +number of collisions and input disagreement. Furthermore, operators reported +that AeroHaptix was more helpful than force feedback, with improved situational +awareness and comparable workload. + +
+
+
+
+
+ + ♻ ☆ Learning Constraint Network from Demonstrations via Positive-Unlabeled + Learning with Memory Replay + + +
+ Planning for a wide range of real-world tasks necessitates knowing and
+specifying all constraints. However, instances exist where these constraints
+are either unknown or challenging to specify accurately. A possible solution is
+to infer the unknown constraints from expert demonstrations. The majority of
+prior works limit themselves to learning simple linear constraints, or require
+strong knowledge of the true constraint parameterization or environmental
+model. To mitigate these problems, this paper presents a positive-unlabeled
+(PU) learning approach to infer a continuous, arbitrary and possibly nonlinear
+constraint from demonstrations. From a PU learning view, we treat all data in
+demonstrations as positive (feasible) data, and learn a (sub-)optimal policy to
+generate high-reward but potentially infeasible trajectories, which serve as
+unlabeled data containing both feasible and infeasible states. Under an
+assumption on the data distribution, a feasible-infeasible classifier (i.e.,
+constraint model) is learned from the two datasets through a postprocessing PU
+learning technique. The entire method employs an iterative framework
+alternating between updating the policy, which generates and selects
+higher-reward policies, and updating the constraint model. Additionally, a
+memory buffer is introduced to record and reuse samples from previous
+iterations to prevent forgetting. The effectiveness of the proposed method is
+validated in two MuJoCo environments, successfully inferring continuous
+nonlinear constraints and outperforming a baseline method in terms of
+constraint accuracy and policy safety.
+
+
+
+
+
+ + ♻ ☆ Positive-Unlabeled Constraint Learning for Inferring Nonlinear + Continuous Constraints Functions from Expert Demonstrations + + +
+ Planning for diverse real-world robotic tasks necessitates knowing and
+specifying all constraints. However, instances exist where these constraints
+are either unknown or challenging to specify accurately. A possible solution is
+to infer the unknown constraints from expert demonstration. This paper presents
+a novel two-step Positive-Unlabeled Constraint Learning (PUCL) algorithm to
+infer a continuous constraint function from demonstrations, without requiring
+prior knowledge of the true constraint parameterization or environmental model
+as existing works do. We treat all data in demonstrations as positive
+(feasible) data, and learn a control policy to generate potentially infeasible
+trajectories, which serve as unlabeled data. The proposed two-step learning
+framework first identifies reliable infeasible data using a distance metric,
+and second learns a binary feasibility classifier (i.e., constraint function)
+from the feasible demonstrations and the reliable infeasible data. The proposed
+method is flexible enough to learn constraint boundaries of complex shapes and,
+unlike previous methods, does not mistakenly classify demonstrations as
+infeasible. Its effectiveness is verified in four constrained environments,
+using a networked policy or a dynamical system policy. It successfully infers
+the continuous nonlinear constraints and outperforms other baseline methods in
+terms of constraint accuracy and policy safety. This work has been published in
+IEEE Robotics and Automation Letters (RA-L). Please refer to the final version
+at https://doi.org/10.1109/LRA.2024.3522756
+
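+ A toy sketch of the two-step idea (reliable-negative selection by a distance metric,
+then a binary feasibility classifier); the threshold, data, and classifier choice here
+are illustrative assumptions, not the paper's settings:
+
+# Toy two-step positive-unlabeled constraint learning (illustrative, not the paper's code).
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+rng = np.random.default_rng(0)
+demos = rng.uniform(-1.0, 0.0, size=(200, 2))        # feasible (positive) states
+unlabeled = rng.uniform(-1.0, 1.0, size=(400, 2))    # policy rollouts: mixed feasibility
+
+# Step 1: distance of each unlabeled state to its nearest demonstration state;
+# states far from every demonstration are treated as reliably infeasible.
+dists = np.min(np.linalg.norm(unlabeled[:, None, :] - demos[None, :, :], axis=-1), axis=1)
+reliable_neg = unlabeled[dists > 0.3]                # assumed threshold
+
+# Step 2: binary feasibility classifier (the learned constraint function).
+X = np.vstack([demos, reliable_neg])
+y = np.concatenate([np.ones(len(demos)), np.zeros(len(reliable_neg))])
+clf = LogisticRegression().fit(X, y)
+
+print(clf.predict_proba([[-0.5, -0.5]])[0, 1])  # high -> predicted feasible
+print(clf.predict_proba([[0.8, 0.8]])[0, 1])    # low  -> predicted constraint violation
+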
+
+
+
+
+ + ♻ ☆ Humanoid Robot RHP Friends: Seamless Combination of Autonomous and + Teleoperated Tasks in a Nursing Context + + +
+ This paper describes RHP Friends, a social humanoid robot developed to enable
+assistive robotic deployments in human-coexisting environments. As a use-case
+application, we present its potential use in nursing by extending its
+capabilities to operate human devices and tools according to the task and by
+enabling remote assistance operations. To meet the wide variety of tasks and
+situations in environments designed by and for humans, we developed a system
+that seamlessly integrates the slim and lightweight robot with several
+technologies: locomanipulation, multi-contact motion, teleoperation, and object
+detection and tracking. We demonstrated the system's usage in a nursing
+application. The robot efficiently performed the daily task of patient transfer
+and a non-routine task, represented by a request to operate a circuit breaker.
+This demonstration, held at the 2023 International Robot Exhibition (IREX), was
+conducted three times a day over three days.
+
+
+ comment: IEEE Robotics and Automation Magazine, In press +
+
+
+
+
+ + ♻ ☆ Equivariant IMU Preintegration with Biases: a Galilean Group Approach + + +
+ This letter proposes a new approach for Inertial Measurement Unit (IMU) +preintegration, a fundamental building block that can be leveraged in different +optimization-based Inertial Navigation System (INS) localization solutions. +Inspired by recent advances in equivariant theory applied to biased INSs, we +derive a discrete-time formulation of the IMU preintegration on +${\mathbf{Gal}(3) \ltimes \mathfrak{gal}(3)}$, the left-trivialization of the +tangent group of the Galilean group $\mathbf{Gal}(3)$. We define a novel +preintegration error that geometrically couples the navigation states and the +bias leading to lower linearization error. Our method improves in consistency +compared to existing preintegration approaches which treat IMU biases as a +separate state-space. Extensive validation against state-of-the-art methods, +both in simulation and with real-world IMU data, implementation in the Lie++ +library, and open-source code are provided. + +
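+ For readers unfamiliar with the notation, one commonly used homogeneous-matrix
+realization of an extended-pose element of $\mathbf{Gal}(3)$ (my recollection of the
+standard construction, not necessarily the exact parameterization used in the letter) is:
+
+% 5x5 matrix realization: rotation R in SO(3), velocity v, position p, time t.
+X =
+\begin{bmatrix}
+R & v & p \\
+0_{1\times 3} & 1 & t \\
+0_{1\times 3} & 0 & 1
+\end{bmatrix}
+\in \mathbb{R}^{5\times 5},
+\qquad
+X_1 X_2 =
+\begin{bmatrix}
+R_1 R_2 & R_1 v_2 + v_1 & R_1 p_2 + v_1 t_2 + p_1 \\
+0_{1\times 3} & 1 & t_1 + t_2 \\
+0_{1\times 3} & 0 & 1
+\end{bmatrix},
+
+so the composition couples position, velocity, and time exactly as a constant-velocity
+displacement over the interval $t_2$ would.
+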
+
+
+
+
+ + ♻ ☆ PO-GVINS: Tightly Coupled GNSS-Visual-Inertial Integration with + Pose-Only Representation + + +
+ Accurate and reliable positioning is crucial for perception, decision-making,
+and other high-level applications in autonomous driving, unmanned aerial
+vehicles, and intelligent robots. Given the inherent limitations of standalone
+sensors, integrating heterogeneous sensors with complementary capabilities is
+one of the most effective approaches to achieving this goal. In this paper, we
+propose a filtering-based, tightly coupled global navigation satellite system
+(GNSS)-visual-inertial positioning framework with a pose-only formulation
+applied to the visual-inertial system (VINS), termed PO-GVINS. Specifically,
+the multiple-view imaging used in current VINS requires a prior on 3D features
+and jointly estimates camera poses and 3D feature positions, which inevitably
+introduces linearization error of the features and faces dimensional explosion.
+In contrast, the pose-only (PO) formulation, which is demonstrated to be
+equivalent to multiple-view imaging and has been applied in visual
+reconstruction, represents feature depth using two camera poses, so that 3D
+feature positions are removed from the state vector, avoiding the
+aforementioned difficulties. Inspired by this, we first apply the PO
+formulation to our VINS, i.e., PO-VINS. Raw GNSS measurements are then
+incorporated, with integer ambiguities resolved, to achieve accurate and
+drift-free estimation. Extensive experiments demonstrate that the proposed
+PO-VINS significantly outperforms the multi-state constrained Kalman filter
+(MSCKF). By incorporating GNSS measurements, PO-GVINS achieves accurate,
+drift-free state estimation, making it a robust solution for positioning in
+challenging environments.
+
+
+
+
+
+ + ♻ ☆ Autonomous Algorithm for Training Autonomous Vehicles with Minimal Human + Intervention + + +
+ Recent reinforcement learning (RL) algorithms have demonstrated impressive +results in simulated driving environments. However, autonomous vehicles trained +in simulation often struggle to work well in the real world due to the fidelity +gap between simulated and real-world environments. While directly training +real-world autonomous vehicles with RL algorithms is a promising approach to +bypass the fidelity gap problem, it presents several challenges. One critical +yet often overlooked challenge is the need to reset a driving environment +between every episode. This reset process demands significant human +intervention, leading to poor training efficiency in the real world. In this +paper, we introduce a novel autonomous algorithm that enables off-the-shelf RL +algorithms to train autonomous vehicles with minimal human intervention. Our +algorithm reduces unnecessary human intervention by aborting episodes to +prevent unsafe states and identifying informative initial states for subsequent +episodes. The key idea behind identifying informative initial states is to +estimate the expected amount of information that can be obtained from +under-explored but reachable states. Our algorithm also revisits rule-based +autonomous driving algorithms and highlights their benefits in safely returning +an autonomous vehicle to initial states. To evaluate how much human +intervention is required during training, we implement challenging urban +driving tasks that require an autonomous vehicle to reset to initial states on +its own. The experimental results show that our autonomous algorithm is +task-agnostic and achieves competitive driving performance with much less human +intervention than baselines. + +
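+ The paper's exact information estimate is not reproduced here; the sketch below only
+illustrates one common proxy for "expected information", namely picking the reachable
+candidate start state where an ensemble of learned models disagrees most:
+
+# Illustrative initial-state selection by ensemble disagreement (a stand-in for the
+# paper's estimator of expected information from under-explored reachable states).
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def ensemble_predictions(state, n_models=5):
+    """Stand-in for an ensemble of learned models evaluated at a candidate start state:
+    predictions disagree more in the under-explored region where state[0] > 0.5."""
+    return np.array([np.sin(state.sum()) + 0.1 * rng.standard_normal() * (state[0] > 0.5)
+                     for _ in range(n_models)])
+
+def select_initial_state(candidates):
+    """Among reachable candidate start states, pick the one with highest ensemble variance."""
+    scores = [np.var(ensemble_predictions(s)) for s in candidates]
+    return candidates[int(np.argmax(scores))]
+
+candidates = [np.array([0.1, 0.0]), np.array([0.9, 0.2]), np.array([0.3, -0.4])]
+print(select_initial_state(candidates))  # tends to pick the under-explored region
+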
+
+ comment: 8 pages, 6 figures, 2 tables, conference +
+
+
+
+
+ + ♻ ☆ Gameplay Filters: Robust Zero-Shot Safety through Adversarial + Imagination + + +
+ Despite the impressive recent advances in learning-based robot control, +ensuring robustness to out-of-distribution conditions remains an open +challenge. Safety filters can, in principle, keep arbitrary control policies +from incurring catastrophic failures by overriding unsafe actions, but existing +solutions for complex (e.g., legged) robot dynamics do not span the full motion +envelope and instead rely on local, reduced-order models. These filters tend to +overly restrict agility and can still fail when perturbed away from nominal +conditions. This paper presents the gameplay filter, a new class of predictive +safety filter that continually plays out hypothetical matches between its +simulation-trained safety strategy and a virtual adversary co-trained to invoke +worst-case events and sim-to-real error, and precludes actions that would cause +failures down the line. We demonstrate the scalability and robustness of the +approach with a first-of-its-kind full-order safety filter for (36-D) +quadrupedal dynamics. Physical experiments on two different quadruped platforms +demonstrate the superior zero-shot effectiveness of the gameplay filter under +large perturbations such as tugging and unmodeled terrain. Experiment videos +and open-source software are available online: +https://saferobotics.org/research/gameplay-filter + +
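+ A schematic of the filtering logic as described in the abstract, with placeholder
+dynamics, policies, and failure check (not the released full-order quadruped filter):
+
+# Before applying the task policy's action, imagine a hypothetical match: the safety
+# strategy plays against a worst-case adversary from the resulting state; if that
+# rollout ever fails, override the task action with the safety action.
+import numpy as np
+
+def step(x, u, d):                 # toy 1-D dynamics with disturbance d
+    return x + 0.1 * (u + d)
+
+def task_policy(x):                # nominal controller (may be unsafe)
+    return 1.0
+
+def safety_policy(x):              # simulation-trained safety strategy
+    return -2.0 * x
+
+def adversary(x):                  # co-trained worst-case disturbance
+    return np.sign(x)
+
+def fails(x):                      # failure set: leaving the safe interval [-1, 1]
+    return abs(x) > 1.0
+
+def gameplay_filter(x, horizon=50):
+    u_task = task_policy(x)
+    x_sim = step(x, u_task, adversary(x))            # imagine applying the task action once
+    for _ in range(horizon):
+        if fails(x_sim):
+            return safety_policy(x)                  # hypothetical match failed -> override
+        x_sim = step(x_sim, safety_policy(x_sim), adversary(x_sim))
+    return u_task                                    # rollout stayed safe -> keep task action
+
+print(gameplay_filter(0.0), gameplay_filter(0.95))   # task action kept vs. overridden
+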
+
+
+
+
+ + ♻ ☆ The Dark Side of Rich Rewards: Understanding and Mitigating Noise in VLM + Rewards + + +
+ While Vision-Language Models (VLMs) are increasingly used to generate reward
+signals for training embodied agents to follow instructions, our research
+reveals that agents guided by VLM rewards often underperform compared to those
+employing only intrinsic (exploration-driven) rewards, contradicting
+expectations set by recent work. We hypothesize that false positive rewards --
+instances where unintended trajectories are incorrectly rewarded -- are more
+detrimental than false negatives. Our analysis confirms this hypothesis,
+revealing that the widely used cosine similarity metric is prone to false
+positive reward estimates. To address this, we introduce BiMI (Binary Mutual
+Information), a novel reward function designed to mitigate noise. BiMI
+significantly enhances learning efficiency across diverse and challenging
+embodied navigation environments. Our findings offer a nuanced understanding of
+how different types of reward noise impact agent learning and highlight the
+importance of addressing multimodal reward signal noise when training embodied
+agents.
+
+
+ comment: 11 main body pages, 21 appendix pages +
+
+
+
+
+ + ♻ ☆ Visual collective behaviors on spherical robots + + +
+ Implementations of collective motion traditionally disregard the limited
+sensing capabilities of an individual, instead assuming an omniscient
+perception of the environment. This study implements a visual flocking model in
+a "robot-in-the-loop" approach to reproduce these behaviors with a flock
+composed of 10 independent spherical robots. The model achieves robotic
+collective motion using only the panoramic visual information available to each
+robot, such as the retinal position, optical size and optic flow of the
+neighboring robots. We introduce a virtual anchor to confine the collective
+robotic movements so as to avoid wall interactions. For the first time, a
+simple visual robot-in-the-loop approach succeeds in reproducing several
+collective motion phases, in particular swarming and milling. Another milestone
+achieved by this model is bridging the gap between simulation and physical
+experiments by demonstrating nearly identical behaviors in both environments
+with the same visual model. To conclude, we show that our minimal visual
+collective motion model is sufficient to recreate most collective behaviors on
+a robot-in-the-loop system that is scalable, behaves as numerical simulations
+predict, and is easily comparable to traditional models.
+
+
+ comment: 26 pages, 16 figures, journal bioinspired and biomimetics +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 110 + +
+
+
+ + ☆ Distilling Multi-modal Large Language Models for Autonomous Driving + + +
+ Autonomous driving demands safe motion planning, especially in critical
+"long-tail" scenarios. Recent end-to-end autonomous driving systems leverage
+large language models (LLMs) as planners to improve generalizability to rare
+events. However, using LLMs at test time introduces high computational costs.
+To address this, we propose DiMA, an end-to-end autonomous driving system that
+maintains the efficiency of an LLM-free (or vision-based) planner while
+leveraging the world knowledge of an LLM. DiMA distills the information from a
+multi-modal LLM to a vision-based end-to-end planner through a set of specially
+designed surrogate tasks. Under a joint training strategy, a scene encoder
+common to both networks produces structured representations that are
+semantically grounded as well as aligned to the final planning objective.
+Notably, the LLM is optional at inference, enabling robust planning without
+compromising on efficiency. Training with DiMA results in a 37% reduction in
+the L2 trajectory error and an 80% reduction in the collision rate of the
+vision-based planner, as well as a 44% trajectory error reduction in long-tail
+scenarios. DiMA also achieves state-of-the-art performance on the nuScenes
+planning benchmark.
+
+
+
+
+
+ + ☆ SynthLight: Portrait Relighting with Diffusion Model by Learning to + Re-render Synthetic Faces + + +
+ We introduce SynthLight, a diffusion model for portrait relighting. Our +approach frames image relighting as a re-rendering problem, where pixels are +transformed in response to changes in environmental lighting conditions. Using +a physically-based rendering engine, we synthesize a dataset to simulate this +lighting-conditioned transformation with 3D head assets under varying lighting. +We propose two training and inference strategies to bridge the gap between the +synthetic and real image domains: (1) multi-task training that takes advantage +of real human portraits without lighting labels; (2) an inference time +diffusion sampling procedure based on classifier-free guidance that leverages +the input portrait to better preserve details. Our method generalizes to +diverse real photographs and produces realistic illumination effects, including +specular highlights and cast shadows, while preserving the subject's identity. +Our quantitative experiments on Light Stage data demonstrate results comparable +to state-of-the-art relighting methods. Our qualitative results on in-the-wild +images showcase rich and unprecedented illumination effects. Project Page: +\url{https://vrroom.github.io/synthlight/} + +
+
+ comment: 27 pages, 25 figures, Project Page + https://vrroom.github.io/synthlight/ +
+
+
+
+
+ + ☆ Learnings from Scaling Visual Tokenizers for Reconstruction and + Generation + + +
+ Visual tokenization via auto-encoding empowers state-of-the-art image and
+video generative models by compressing pixels into a latent space. Although
+scaling Transformer-based generators has been central to recent advances, the
+tokenizer component itself is rarely scaled, leaving open questions about how
+auto-encoder design choices influence both its objective of reconstruction and
+downstream generative performance. Our work explores scaling in auto-encoders
+to fill this gap. To facilitate this exploration, we replace the typical
+convolutional backbone with an enhanced Vision Transformer architecture for
+Tokenization (ViTok). We train ViTok on large-scale image and video datasets
+far exceeding ImageNet-1K, removing data constraints on tokenizer scaling. We
+first study how scaling the auto-encoder bottleneck affects both reconstruction
+and generation -- and find that while it is highly correlated with
+reconstruction, its relationship with generation is more complex. We next
+explore the effect of separately scaling the auto-encoder's encoder and decoder
+on reconstruction and generation performance. Crucially, we find that scaling
+the encoder yields minimal gains for either reconstruction or generation, while
+scaling the decoder boosts reconstruction, but the benefits for generation are
+mixed. Building on our exploration, we design ViTok as a lightweight
+auto-encoder that achieves competitive performance with state-of-the-art
+auto-encoders on ImageNet-1K and COCO reconstruction tasks (256p and 512p)
+while outperforming existing auto-encoders on 16-frame 128p video
+reconstruction for UCF-101, all with 2-5x fewer FLOPs. When integrated with
+Diffusion Transformers, ViTok demonstrates competitive performance on image
+generation for ImageNet-1K and sets new state-of-the-art benchmarks for
+class-conditional video generation on UCF-101.
+
+
+ comment: 28 pages, 25 figures, 7 Tables +
+
+
+
+
+ + ☆ Lost in Translation, Found in Context: Sign Language Translation with + Contextual Cues + + +
+ Our objective is to translate continuous sign language into spoken language +text. Inspired by the way human interpreters rely on context for accurate +translation, we incorporate additional contextual cues together with the +signing video, into a new translation framework. Specifically, besides visual +sign recognition features that encode the input video, we integrate +complementary textual information from (i) captions describing the background +show, (ii) translation of previous sentences, as well as (iii) pseudo-glosses +transcribing the signing. These are automatically extracted and inputted along +with the visual features to a pre-trained large language model (LLM), which we +fine-tune to generate spoken language translations in text form. Through +extensive ablation studies, we show the positive contribution of each input cue +to the translation performance. We train and evaluate our approach on BOBSL -- +the largest British Sign Language dataset currently available. We show that our +contextual approach significantly enhances the quality of the translations +compared to previously reported results on BOBSL, and also to state-of-the-art +methods that we implement as baselines. Furthermore, we demonstrate the +generality of our approach by applying it also to How2Sign, an American Sign +Language dataset, and achieve competitive results. + +
+
+
+
+
+ + ☆ SRE-Conv: Symmetric Rotation Equivariant Convolution for Biomedical + Image Classification + + +
+ Convolutional neural networks (CNNs) are essential tools for computer vision +tasks, but they lack traditionally desired properties of extracted features +that could further improve model performance, e.g., rotational equivariance. +Such properties are ubiquitous in biomedical images, which often lack explicit +orientation. While current work largely relies on data augmentation or explicit +modules to capture orientation information, this comes at the expense of +increased training costs or ineffective approximations of the desired +equivariance. To overcome these challenges, we propose a novel and efficient +implementation of the Symmetric Rotation-Equivariant (SRE) Convolution +(SRE-Conv) kernel, designed to learn rotation-invariant features while +simultaneously compressing the model size. The SRE-Conv kernel can easily be +incorporated into any CNN backbone. We validate the ability of a deep SRE-CNN +to capture equivariance to rotation using the public MedMNISTv2 dataset (16 +total tasks). SRE-Conv-CNN demonstrated improved rotated image classification +performance accuracy on all 16 test datasets in both 2D and 3D images, all +while increasing efficiency with fewer parameters and reduced memory footprint. +The code is available at https://github.com/XYPB/SRE-Conv. + +
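+ One way to picture a rotation-symmetric kernel (a minimal construction of my own, not
+necessarily the paper's exact SRE-Conv parameterization): share one learnable weight per
+radial ring of the kernel, so the kernel is unchanged under rotations about its center:
+
+# Minimal rotation-symmetric convolution kernel: one shared weight per radial ring.
+import numpy as np
+
+def symmetric_kernel(ring_weights, size=5):
+    """Build a size x size kernel whose value depends only on distance from the center."""
+    c = (size - 1) / 2.0
+    yy, xx = np.meshgrid(np.arange(size), np.arange(size), indexing="ij")
+    radius = np.sqrt((yy - c) ** 2 + (xx - c) ** 2)
+    ring_idx = np.minimum(np.round(radius).astype(int), len(ring_weights) - 1)
+    return np.asarray(ring_weights)[ring_idx]
+
+k = symmetric_kernel([1.0, 0.5, -0.25], size=5)
+print(k)
+print(np.allclose(k, np.rot90(k)))  # True: a 90-degree rotation leaves the kernel unchanged
+# Note the parameter count: 3 ring weights instead of 25 free entries, hence the
+# model-size compression mentioned above.
+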
+
+ comment: Accepted by IEEE ISBI 2025 4-page paper +
+
+
+
+
+ + ☆ ComplexVAD: Detecting Interaction Anomalies in Video + + +
+ Existing video anomaly detection datasets are inadequate for representing +complex anomalies that occur due to the interactions between objects. The +absence of complex anomalies in previous video anomaly detection datasets +affects research by shifting the focus onto simple anomalies. To address this +problem, we introduce a new large-scale dataset: ComplexVAD. In addition, we +propose a novel method to detect complex anomalies via modeling the +interactions between objects using a scene graph with spatio-temporal +attributes. With our proposed method and two other state-of-the-art video +anomaly detection methods, we obtain baseline scores on ComplexVAD and +demonstrate that our new method outperforms existing works. + +
+
+ comment: 16 pages, 11 figures, to appear in WACV Workshop ASTAD 2025 +
+
+
+
+
+ + ☆ Inference-Time Scaling for Diffusion Models beyond Scaling Denoising + Steps + + +
+ Generative models have made significant impacts across various domains,
+largely due to their ability to scale during training by increasing data,
+computational resources, and model size, a phenomenon characterized by the
+scaling laws. Recent research has begun to explore inference-time scaling
+behavior in Large Language Models (LLMs), revealing how performance can further
+improve with additional computation during inference. Unlike LLMs, diffusion
+models inherently possess the flexibility to adjust inference-time computation
+via the number of denoising steps, although the performance gains typically
+flatten after a few dozen. In this work, we explore the inference-time scaling
+behavior of diffusion models beyond increasing denoising steps and investigate
+how generation performance can further improve with increased computation.
+Specifically, we consider a search problem aimed at identifying better noises
+for the diffusion sampling process. We structure the design space along two
+axes: the verifiers used to provide feedback, and the algorithms used to find
+better noise candidates. Through extensive experiments on class-conditioned and
+text-conditioned image generation benchmarks, our findings reveal that
+increasing inference-time compute leads to substantial improvements in the
+quality of samples generated by diffusion models, and that, given the complex
+nature of images, the components of the framework can be combined in ways
+specifically chosen to suit different application scenarios.
+
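+ The simplest point in that design space is a best-of-N search over initial noises with
+an external verifier; the sketch below shows that skeleton with placeholder sampler and
+verifier functions (not the paper's specific verifiers or search algorithms):
+
+# Best-of-N search over initial noises scored by a verifier (illustrative skeleton).
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def diffusion_sample(noise):
+    """Stand-in for a full denoising run that maps an initial noise to an image."""
+    return np.tanh(noise)
+
+def verifier_score(image):
+    """Stand-in for an external verifier (e.g., a reward or preference model)."""
+    return float(image.mean())
+
+def search_noise(num_candidates=16, shape=(3, 64, 64)):
+    best_img, best_score = None, -np.inf
+    for _ in range(num_candidates):      # more candidates = more inference-time compute
+        img = diffusion_sample(rng.standard_normal(shape))
+        score = verifier_score(img)
+        if score > best_score:
+            best_img, best_score = img, score
+    return best_img, best_score
+
+_, score = search_noise()
+print(round(score, 4))
+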
+
+
+
+
+ + ☆ A Simple Aerial Detection Baseline of Multimodal Language Models + + +
+ Multimodal language models (MLMs) based on the generative pre-trained
+Transformer are considered powerful candidates for unifying various domains and
+tasks. MLMs developed for remote sensing (RS) have demonstrated outstanding
+performance in multiple tasks, such as visual question answering and visual
+grounding. In addition to visual grounding, which detects specific objects
+corresponding to a given instruction, aerial detection, which detects all
+objects of multiple categories, is also a valuable and challenging task for RS
+foundation models. However, aerial detection has not been explored by existing
+RS MLMs because the autoregressive prediction mechanism of MLMs differs
+significantly from the detection outputs. In this paper, we present a simple
+baseline for applying MLMs to aerial detection for the first time, named
+LMMRotate. Specifically, we first introduce a normalization method to transform
+detection outputs into textual outputs that are compatible with the MLM
+framework. Then, we propose an evaluation method that ensures a fair comparison
+between MLMs and conventional object detection models. We construct the
+baseline by fine-tuning open-source general-purpose MLMs and achieve impressive
+detection performance comparable to conventional detectors. We hope that this
+baseline will serve as a reference for future MLM development, enabling more
+comprehensive capabilities for understanding RS images. Code is available at
+https://github.com/Li-Qingyun/mllm-mmrotate.
+
+
+ comment: 4 pages, 1 table, 4 figures +
+
+
+
+
+ + ☆ FLOL: Fast Baselines for Real-World Low-Light Enhancement + + +
+ Low-Light Image Enhancement (LLIE) is a key task in computational photography
+and imaging. The problem of enhancing images captured at night or in dark
+environments has been well studied in the image signal processing literature.
+However, current deep learning-based solutions struggle with efficiency and
+robustness in real-world scenarios (e.g., scenes with noise, saturated pixels,
+bad illumination). We propose a lightweight neural network that combines image
+processing in the frequency and spatial domains. Our method, FLOL+, is one of
+the fastest models for this task, achieving state-of-the-art results on popular
+real-scene datasets such as LOL and LSRW. Moreover, we are able to process
+1080p images in under 12 ms. Code and models at https://github.com/cidautai/FLOL
+
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Practical Continual Forgetting for Pre-trained Vision Models + + +
+ For privacy and security concerns, the need to erase unwanted information +from pre-trained vision models is becoming evident nowadays. In real-world +scenarios, erasure requests originate at any time from both users and model +owners, and these requests usually form a sequence. Therefore, under such a +setting, selective information is expected to be continuously removed from a +pre-trained model while maintaining the rest. We define this problem as +continual forgetting and identify three key challenges. (i) For unwanted +knowledge, efficient and effective deleting is crucial. (ii) For remaining +knowledge, the impact brought by the forgetting procedure should be minimal. +(iii) In real-world scenarios, the training samples may be scarce or partially +missing during the process of forgetting. To address them, we first propose +Group Sparse LoRA (GS-LoRA). Specifically, towards (i), we introduce LoRA +modules to fine-tune the FFN layers in Transformer blocks for each forgetting +task independently, and towards (ii), a simple group sparse regularization is +adopted, enabling automatic selection of specific LoRA groups and zeroing out +the others. To further extend GS-LoRA to more practical scenarios, we +incorporate prototype information as additional supervision and introduce a +more practical approach, GS-LoRA++. For each forgotten class, we move the +logits away from its original prototype. For the remaining classes, we pull the +logits closer to their respective prototypes. We conduct extensive experiments +on face recognition, object detection and image classification and demonstrate +that our method manages to forget specific classes with minimal impact on other +classes. Codes have been released on https://github.com/bjzhb666/GS-LoRA. + +
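+ A minimal version of the group-sparse idea (an L2,1-style penalty over LoRA groups,
+written as a torch sketch with made-up toy groups; not the released GS-LoRA code):
+
+# Group-sparse regularizer that drives entire LoRA groups to zero, so only the groups
+# needed for the current forgetting task stay active.
+import torch
+
+def group_sparse_penalty(lora_groups, eps=1e-8):
+    """Sum over groups of the L2 norm of each group's parameters (sum_g ||theta_g||_2)."""
+    total = torch.zeros(())
+    for params in lora_groups:
+        sq = sum((p ** 2).sum() for p in params)
+        total = total + torch.sqrt(sq + eps)
+    return total
+
+# Toy LoRA groups: (A, B) low-rank factors attached to two FFN layers.
+g1 = [torch.randn(16, 4, requires_grad=True), torch.randn(4, 16, requires_grad=True)]
+g2 = [torch.zeros(16, 4, requires_grad=True), torch.zeros(4, 16, requires_grad=True)]
+
+reg = group_sparse_penalty([g1, g2])
+loss = reg            # in practice: forgetting/retention losses + lambda * reg
+loss.backward()
+print(float(reg))     # only g1 contributes; g2 has already been zeroed out
+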
+
+
+
+
+ + ☆ Mitigating Hallucinations in Large Vision-Language Models via DPO: + On-Policy Data Hold the Key + + +
+ Hallucination remains a major challenge for Large Vision-Language Models +(LVLMs). Direct Preference Optimization (DPO) has gained increasing attention +as a simple solution to hallucination issues. It directly learns from +constructed preference pairs that reflect the severity of hallucinations in +responses to the same prompt and image. Nonetheless, different data +construction methods in existing works bring notable performance variations. We +identify a crucial factor here: outcomes are largely contingent on whether the +constructed data aligns on-policy w.r.t the initial (reference) policy of DPO. +Theoretical analysis suggests that learning from off-policy data is impeded by +the presence of KL-divergence between the updated policy and the reference +policy. From the perspective of dataset distribution, we systematically +summarize the inherent flaws in existing algorithms that employ DPO to address +hallucination issues. To alleviate the problems, we propose On-Policy Alignment +(OPA)-DPO framework, which uniquely leverages expert feedback to correct +hallucinated responses and aligns both the original and expert-revised +responses in an on-policy manner. Notably, with only 4.8k data, OPA-DPO +achieves an additional reduction in the hallucination rate of LLaVA-1.5-7B: +13.26% on the AMBER benchmark and 5.39% on the Object-Hal benchmark, compared +to the previous SOTA algorithm trained with 16k samples. + +
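+ For reference, the standard DPO objective such pipelines build on (a generic torch
+sketch of the published DPO loss; OPA-DPO's contribution lies in how the preference
+pairs are constructed on-policy, which is not shown here):
+
+# Generic DPO loss on (chosen, rejected) response pairs.
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
+    """-log sigmoid(beta * [(logpi_w - logpi_ref_w) - (logpi_l - logpi_ref_l)])."""
+    chosen_margin = logp_chosen - ref_logp_chosen
+    rejected_margin = logp_rejected - ref_logp_rejected
+    return -F.logsigmoid(beta * (chosen_margin - rejected_margin)).mean()
+
+# Toy batch of summed token log-probabilities under the current and reference policies.
+logp_w = torch.tensor([-12.0, -10.5])
+logp_l = torch.tensor([-13.5, -11.0])
+ref_w = torch.tensor([-12.5, -10.7])
+ref_l = torch.tensor([-12.8, -10.9])
+print(float(dpo_loss(logp_w, logp_l, ref_w, ref_l)))
+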
+
+ comment: 18 pages, 15 figures +
+
+
+
+
+ + ☆ Fine-Grained Image-Text Correspondence with Cost Aggregation for + Open-Vocabulary Part Segmentation + + +
+ Open-Vocabulary Part Segmentation (OVPS) is an emerging field for recognizing +fine-grained parts in unseen categories. We identify two primary challenges in +OVPS: (1) the difficulty in aligning part-level image-text correspondence, and +(2) the lack of structural understanding in segmenting object parts. To address +these issues, we propose PartCATSeg, a novel framework that integrates +object-aware part-level cost aggregation, compositional loss, and structural +guidance from DINO. Our approach employs a disentangled cost aggregation +strategy that handles object and part-level costs separately, enhancing the +precision of part-level segmentation. We also introduce a compositional loss to +better capture part-object relationships, compensating for the limited part +annotations. Additionally, structural guidance from DINO features improves +boundary delineation and inter-part understanding. Extensive experiments on +Pascal-Part-116, ADE20K-Part-234, and PartImageNet datasets demonstrate that +our method significantly outperforms state-of-the-art approaches, setting a new +baseline for robust generalization to unseen part categories. + +
+
+
+
+
+ + ☆ Robin: a Suite of Multi-Scale Vision-Language Models and the CHIRP + Evaluation Benchmark + + +
+ The proliferation of Vision-Language Models (VLMs) in the past several years +calls for rigorous and comprehensive evaluation methods and benchmarks. This +work analyzes existing VLM evaluation techniques, including automated metrics, +AI-based assessments, and human evaluations across diverse tasks. We first +introduce Robin - a novel suite of VLMs that we built by combining Large +Language Models (LLMs) and Vision Encoders (VEs) at multiple scales, and use +Robin to identify shortcomings of current evaluation approaches across scales. +Next, to overcome the identified limitations, we introduce CHIRP - a new long +form response benchmark we developed for more robust and complete VLM +evaluation. We provide open access to the Robin training code, model suite, and +CHIRP benchmark to promote reproducibility and advance VLM research. + +
+
+
+
+
+ + ☆ Unified Face Matching and Physical-Digital Spoofing Attack Detection + + +
+ Face recognition technology has dramatically transformed the landscape of +security, surveillance, and authentication systems, offering a user-friendly +and non-invasive biometric solution. However, despite its significant +advantages, face recognition systems face increasing threats from physical and +digital spoofing attacks. Current research typically treats face recognition +and attack detection as distinct classification challenges. This approach +necessitates the implementation of separate models for each task, leading to +considerable computational complexity, particularly on devices with limited +resources. Such inefficiencies can stifle scalability and hinder performance. +In response to these challenges, this paper introduces an innovative unified +model designed for face recognition and detection of physical and digital +attacks. By leveraging the advanced Swin Transformer backbone and incorporating +HiLo attention in a convolutional neural network framework, we address unified +face recognition and spoof attack detection more effectively. Moreover, we +introduce augmentation techniques that replicate the traits of physical and +digital spoofing cues, significantly enhancing our model robustness. Through +comprehensive experimental evaluation across various datasets, we showcase the +effectiveness of our model in unified face recognition and spoof detection. +Additionally, we confirm its resilience against unseen physical and digital +spoofing attacks, underscoring its potential for real-world applications. + +
+
+
+
+
+ + ☆ WMamba: Wavelet-based Mamba for Face Forgery Detection + + +
+ With the rapid advancement of deepfake generation technologies, the demand +for robust and accurate face forgery detection algorithms has become +increasingly critical. Recent studies have demonstrated that wavelet analysis +can uncover subtle forgery artifacts that remain imperceptible in the spatial +domain. Wavelets effectively capture important facial contours, which are often +slender, fine-grained, and global in nature. However, existing wavelet-based +approaches fail to fully leverage these unique characteristics, resulting in +sub-optimal feature extraction and limited generalizability. To address this +challenge, we introduce WMamba, a novel wavelet-based feature extractor built +upon the Mamba architecture. WMamba maximizes the utility of wavelet +information through two key innovations. First, we propose Dynamic Contour +Convolution (DCConv), which employs specially crafted deformable kernels to +adaptively model slender facial contours. Second, by leveraging the Mamba +architecture, our method captures long-range spatial relationships with linear +computational complexity. This efficiency allows for the extraction of +fine-grained, global forgery artifacts from small image patches. Extensive +experimental results show that WMamba achieves state-of-the-art (SOTA) +performance, highlighting its effectiveness and superiority in face forgery +detection. + +
+
+
+
+
+ + ☆ Metric Learning with Progressive Self-Distillation for Audio-Visual + Embedding Learning + + +
+ Metric learning projects samples into an embedded space, where similarities +and dissimilarities are quantified based on their learned representations. +However, existing methods often rely on label-guided representation learning, +where representations of different modalities, such as audio and visual data, +are aligned based on annotated labels. This approach tends to underutilize +latent complex features and potential relationships inherent in the +distributions of audio and visual data that are not directly tied to the +labels, resulting in suboptimal performance in audio-visual embedding learning. +To address this issue, we propose a novel architecture that integrates +cross-modal triplet loss with progressive self-distillation. Our method +enhances representation learning by leveraging inherent distributions and +dynamically refining soft audio-visual alignments -- probabilistic alignments +between audio and visual data that capture the inherent relationships beyond +explicit labels. Specifically, the model distills audio-visual +distribution-based knowledge from annotated labels in a subset of each batch. +This self-distilled knowledge is used t + +
+
+ comment: 5 pages, 3 figures, 2 tables. Accepted by ICASSP 2025 +
+
+
+
+
+ + ☆ Mesh2SLAM in VR: A Fast Geometry-Based SLAM Framework for Rapid + Prototyping in Virtual Reality Applications + + +
+ SLAM is a foundational technique with broad applications in robotics and +AR/VR. SLAM simulations evaluate new concepts, but testing on +resource-constrained devices, such as VR HMDs, faces challenges: high +computational cost and restricted sensor data access. This work proposes a +sparse framework using mesh geometry projections as features, which improves +efficiency and circumvents direct sensor data access, advancing SLAM research +as we demonstrate in VR and through numerical evaluation. + +
+
+
+
+
+ + ☆ Sequential PatchCore: Anomaly Detection for Surface Inspection using + Synthetic Impurities + + +
+ The appearance of surface impurities (e.g., water stains, fingerprints, +stickers) is an often-mentioned issue that causes degradation of automated +visual inspection systems. At the same time, synthetic data generation +techniques for visual surface inspection have focused primarily on generating +perfect examples and defects, disregarding impurities. This study highlights +the importance of considering impurities when generating synthetic data. We +introduce a procedural method to include photorealistic water stains in +synthetic data. The synthetic datasets are generated to correspond to real +datasets and are further used to train an anomaly detection model and +investigate the influence of water stains. The high-resolution images used for +surface inspection lead to memory bottlenecks during anomaly detection +training. To address this, we introduce Sequential PatchCore - a method to +build coresets sequentially and make training on large images using +consumer-grade hardware tractable. This allows us to perform transfer learning +using coresets pre-trained on different dataset versions. Our results show the +benefits of using synthetic data for pre-training an explicit coreset anomaly +model and the extended performance benefits of finetuning the coreset using +real data. We observed how the impurities and labelling ambiguity lower the +model performance and have additionally reported the defect-wise recall to +provide an industrially relevant perspective on model performance. + +
+
+
+
+
+ + ☆ A New Teacher-Reviewer-Student Framework for Semi-supervised 2D Human + Pose Estimation + + +
+ Conventional 2D human pose estimation methods typically require extensive
+labeled annotations, which are both labor-intensive and expensive. In contrast,
+semi-supervised 2D human pose estimation can alleviate the above problems by
+leveraging a large amount of unlabeled data along with a small portion of
+labeled data. Existing semi-supervised 2D human pose estimation methods update
+the network through backpropagation, ignoring crucial historical information
+from the previous training process. Therefore, we propose a novel
+semi-supervised 2D human pose estimation method that utilizes a newly designed
+Teacher-Reviewer-Student framework. Specifically, we first design our framework
+by mimicking the way human beings constantly review previous knowledge for
+consolidation: the teacher predicts results to guide the student's learning,
+and the reviewer stores important historical parameters to provide additional
+supervision signals. Secondly, we introduce a Multi-level Feature Learning
+strategy, which utilizes the outputs from different stages of the backbone to
+estimate the heatmap to guide network training, enriching the supervisory
+information while effectively capturing keypoint relationships. Finally, we
+design a data augmentation strategy, i.e., Keypoint-Mix, to perturb pose
+information by mixing different keypoints, thus enhancing the network's ability
+to discern keypoints. Extensive experiments on publicly available datasets
+demonstrate that our method achieves significant improvements compared to
+existing methods.
+
+
+
+
+
+ + ☆ Text-driven Adaptation of Foundation Models for Few-shot Surgical + Workflow Analysis + + +
+ Purpose: Surgical workflow analysis is crucial for improving surgical +efficiency and safety. However, previous studies rely heavily on large-scale +annotated datasets, posing challenges in cost, scalability, and reliance on +expert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven +Adaptation), designed to handle various surgical workflow analysis tasks with +minimal paired image-label data. + Methods: Our approach has two key components. First, Few-shot selection-based +modality alignment selects a small subset of images and aligns their embeddings +with text embeddings from the downstream task, bridging the modality gap. +Second, Text-driven adaptation leverages only text data to train a decoder, +eliminating the need for paired image-text data. This decoder is then applied +to aligned image embeddings, enabling image-related tasks without explicit +image-text pairs. + Results: We evaluate our approach to generative tasks (image captioning) and +discriminative tasks (triplet recognition and phase recognition). Results show +that Surg-FTDA outperforms baselines and generalizes well across downstream +tasks. + Conclusion: We propose a text-driven adaptation approach that mitigates the +modality gap and handles multiple downstream tasks in surgical workflow +analysis, with minimal reliance on large annotated datasets. The code and +dataset will be released in https://github.com/TingxuanSix/Surg-FTDA. + +
+
+
+
+
+ + ☆ Exploring AI-based System Design for Pixel-level Protected Health + Information Detection in Medical Images + + +
+ De-identification of medical images is a critical step to ensure privacy +during data sharing in research and clinical settings. The initial step in this +process involves detecting Protected Health Information (PHI), which can be +found in image metadata or imprinted within image pixels. Despite the +importance of such systems, there has been limited evaluation of existing +AI-based solutions, creating barriers to the development of reliable and robust +tools. In this study, we present an AI-based pipeline for PHI detection, +comprising three key components: text detection, text extraction, and analysis +of PHI content in medical images. By experimenting with exchanging roles of +vision and language models within the pipeline, we evaluate the performance and +recommend the best setup for the PHI detection task. + +
+
+ comment: In progress +
+
+
+
+
+ + ☆ AdaFV: Accelerating VLMs with Self-Adaptive Cross-Modality Attention + Mixture + + +
+ The success of VLMs often relies on a dynamic high-resolution scheme that
+adaptively splits the input images into multiple crops, so that the details of
+the images can be retained. However, such approaches result in a large number
+of redundant visual tokens, significantly reducing the efficiency of the VLMs.
+To improve the VLMs' efficiency without introducing extra training costs, many
+works propose to reduce the visual tokens by filtering out uninformative visual
+tokens or aggregating their information. Some approaches reduce the visual
+tokens according to the self-attention of VLMs, which is biased and can result
+in inaccurate responses. Token reduction approaches that rely solely on visual
+cues are text-agnostic and fail to focus on the areas that are most relevant to
+the question, especially when the queried objects are non-salient in the image.
+In this work, we first conduct experiments to show that the original text
+embeddings are aligned with the visual tokens, without bias toward the tailed
+visual tokens. We then propose a self-adaptive cross-modality attention mixture
+mechanism that dynamically leverages the effectiveness of visual saliency and
+text-to-image similarity in the pre-LLM layers to select the visual tokens that
+are informative. Extensive experiments demonstrate that the proposed approach
+achieves state-of-the-art training-free VLM acceleration performance,
+especially when the reduction rate is sufficiently large.
+
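+ A bare-bones version of the selection rule as described above: score each visual token
+by a mixture of visual saliency and text-to-image similarity, then keep the top-k (toy
+tensors and a fixed mixture weight; the paper computes the mixture adaptively):
+
+# Text-aware visual-token selection sketch (illustrative only).
+import torch
+import torch.nn.functional as F
+
+def select_visual_tokens(visual_tokens, text_embedding, saliency, keep_ratio=0.25, alpha=0.5):
+    """visual_tokens: (N, D); text_embedding: (D,); saliency: (N,). Returns kept tokens."""
+    sim = F.cosine_similarity(visual_tokens, text_embedding.unsqueeze(0), dim=-1)  # text relevance
+    score = alpha * saliency + (1 - alpha) * sim                                    # mixed score
+    k = max(1, int(keep_ratio * visual_tokens.shape[0]))
+    idx = torch.topk(score, k).indices
+    return visual_tokens[idx], idx
+
+tokens = torch.randn(576, 1024)   # e.g., 24x24 visual tokens from one crop
+text = torch.randn(1024)
+saliency = torch.rand(576)
+kept, idx = select_visual_tokens(tokens, text, saliency)
+print(kept.shape)                  # torch.Size([144, 1024]) at a 25% keep ratio
+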
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ HydraMix: Multi-Image Feature Mixing for Small Data Image Classification + + +
+ Training deep neural networks requires datasets with a large number of +annotated examples. The collection and annotation of these datasets is not only +extremely expensive but also faces legal and privacy problems. These factors +are a significant limitation for many real-world applications. To address this, +we introduce HydraMix, a novel architecture that generates new image +compositions by mixing multiple different images from the same class. HydraMix +learns the fusion of the content of various images guided by a +segmentation-based mixing mask in feature space and is optimized via a +combination of unsupervised and adversarial training. Our data augmentation +scheme allows the creation of models trained from scratch on very small +datasets. We conduct extensive experiments on ciFAIR-10, STL-10, and +ciFAIR-100. Additionally, we introduce a novel text-image metric to assess the +generality of the augmented datasets. Our results show that HydraMix +outperforms existing state-of-the-art methods for image classification on small +datasets. + +
+
+
+
+
+ + ☆ AnyStory: Towards Unified Single and Multiple Subject Personalization in + Text-to-Image Generation + + +
+ Recently, large-scale generative models have demonstrated outstanding +text-to-image generation capabilities. However, generating high-fidelity +personalized images with specific subjects still presents challenges, +especially in cases involving multiple subjects. In this paper, we propose +AnyStory, a unified approach for personalized subject generation. AnyStory not +only achieves high-fidelity personalization for single subjects, but also for +multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory +models the subject personalization problem in an "encode-then-route" manner. In +the encoding step, AnyStory utilizes a universal and powerful image encoder, +i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve +high-fidelity encoding of subject features. In the routing step, AnyStory +utilizes a decoupled instance-aware subject router to accurately perceive and +predict the potential location of the corresponding subject in the latent +space, and guide the injection of subject conditions. Detailed experimental +results demonstrate the excellent performance of our method in retaining +subject details, aligning text descriptions, and personalizing for multiple +subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ . + +
+
+ comment: Tech report; Project page: + https://aigcdesigngroup.github.io/AnyStory/ +
+
+
+
+
+ + ☆ Omni-Emotion: Extending Video MLLM with Detailed Face and Audio Modeling + for Multimodal Emotion Analysis + + +
+ Understanding emotions accurately is essential for fields like human-computer +interaction. Due to the complexity of emotions and their multi-modal nature +(e.g., emotions are influenced by facial expressions and audio), researchers +have turned to using multi-modal models to understand human emotions rather +than single-modality. However, current video multi-modal large language models +(MLLMs) encounter difficulties in effectively integrating audio and identifying +subtle facial micro-expressions. Furthermore, the lack of detailed emotion +analysis datasets also limits the development of multimodal emotion analysis. +To address these issues, we introduce a self-reviewed dataset and a +human-reviewed dataset, comprising 24,137 coarse-grained samples and 3,500 +manually annotated samples with detailed emotion annotations, respectively. +These datasets allow models to learn from diverse scenarios and better +generalize to real-world applications. Moreover, in addition to the audio +modeling, we propose to explicitly integrate facial encoding models into the +existing advanced Video MLLM, enabling the MLLM to effectively unify audio and +the subtle facial cues for emotion understanding. By aligning these features +within a unified space and employing instruction tuning in our proposed +datasets, our Omni-Emotion achieves state-of-the-art performance in both +emotion recognition and reasoning tasks. + +
+
+
+
+
+ + ☆ VanGogh: A Unified Multimodal Diffusion-based Framework for Video + Colorization + + +
+ Video colorization aims to transform grayscale videos into vivid color
+representations while maintaining temporal consistency and structural
+integrity. Existing video colorization methods often suffer from color bleeding
+and lack comprehensive control, particularly under complex motion or diverse
+semantic cues. To this end, we introduce VanGogh, a unified multimodal
+diffusion-based framework for video colorization. VanGogh tackles these
+challenges using a Dual Qformer to align and fuse features from multiple
+modalities, complemented by a depth-guided generation process and an optical
+flow loss, which help reduce color overflow. Additionally, a color injection
+strategy and luma channel replacement are implemented to improve generalization
+and mitigate flickering artifacts. Thanks to this design, users can exercise
+both global and local control over the generation process, resulting in
+higher-quality colorized videos. Extensive qualitative and quantitative
+evaluations, and user studies, demonstrate that VanGogh achieves superior
+temporal consistency and color fidelity. Project page:
+https://becauseimbatman0.github.io/VanGogh.
+
+
+
+
+
+ + ☆ Comparison of Various SLAM Systems for Mobile Robot in an Indoor + Environment + + +
+ This article presents a comparative analysis of mobile robot trajectories
+computed by various ROS-based SLAM systems. For this purpose we developed a
+prototype of a mobile robot with common sensors: a 2D lidar, a monocular
+camera, and a ZED stereo camera. We then conducted experiments in a typical
+office environment and collected data from all sensors, running all tested
+SLAM systems on the acquired dataset. We studied the following SLAM systems:
+(a) 2D lidar-based: GMapping, Hector SLAM, Cartographer; (b) monocular
+camera-based: Large Scale Direct monocular SLAM (LSD SLAM), ORB SLAM, Direct
+Sparse Odometry (DSO); and (c) stereo camera-based: ZEDfu, Real-Time
+Appearance-Based Mapping (RTAB map), ORB SLAM, Stereo Parallel Tracking and
+Mapping (S-PTAM). Since all SLAM methods were tested on the same dataset, we
+compared the results of the different SLAM systems with appropriate metrics,
+demonstrating encouraging results for the lidar-based Cartographer SLAM,
+monocular ORB SLAM, and stereo RTAB Map methods.
+
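+ A minimal absolute trajectory error (ATE) RMSE computation, one of the usual
+metrics for this kind of comparison, assuming the estimated and ground-truth
+trajectories are already time-associated and expressed in the same frame.
+
+import numpy as np
+
+def ate_rmse(estimated, ground_truth):
+    """estimated, ground_truth: (N, 3) arrays of corresponding positions."""
+    err = np.linalg.norm(estimated - ground_truth, axis=1)
+    return float(np.sqrt(np.mean(err ** 2)))
+
+gt = np.cumsum(np.random.randn(100, 3) * 0.01, axis=0)   # synthetic trajectory
+est = gt + np.random.randn(100, 3) * 0.02                # noisy estimate
+print(f"ATE RMSE: {ate_rmse(est, gt):.3f} m")
+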
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+ + ☆ The Devil is in the Details: Simple Remedies for Image-to-LiDAR + Representation Learning + + +
+ LiDAR is a crucial sensor in autonomous driving, commonly used alongside
+cameras. By exploiting this camera-LiDAR setup and recent advances in image
+representation learning, prior studies have shown the promising potential of
+image-to-LiDAR distillation. These prior works focus on the design of their own
+losses to effectively distill the pre-trained 2D image representations into a
+3D model. However, the other parts of the design have been surprisingly
+unexplored. We find that fundamental design elements, e.g., the LiDAR
+coordinate system, quantization according to the existing input interface, and
+data utilization, are more critical than developing loss functions, yet they
+have been overlooked in prior works. In this work, we show that simple fixes to
+these designs notably outperform existing methods by 16% in 3D semantic
+segmentation on the nuScenes dataset and 13% in 3D object detection on the
+KITTI dataset in downstream task performance. We focus on overlooked design
+choices along the spatial and temporal axes. Spatially, prior work has used
+cylindrical coordinates and voxel sizes without considering the side effects
+they yield with a commonly deployed sparse convolution layer input interface,
+leading to spatial quantization errors in 3D models. Temporally, existing work
+has avoided cumbersome data curation by discarding unsynced data, limiting the
+use to only the small portion of data that is temporally synced across sensors.
+We analyze these effects and propose simple solutions for each overlooked
+aspect.
+
+
+ comment: Accepted to ACCV2024 +
+
+
+
+
+ + ☆ MonoSOWA: Scalable monocular 3D Object detector Without human + Annotations + + +
+ Detecting the three-dimensional position and orientation of objects using a +single RGB camera is a foundational task in computer vision with many important +applications. Traditionally, 3D object detection methods are trained in a +fully-supervised setup, requiring vast amounts of human annotations, which are +laborious, costly, and do not scale well with the ever-increasing amounts of +data being captured. + In this paper, we present the first method to train 3D object detectors for +monocular RGB cameras without domain-specific human annotations, thus making +orders of magnitude more data available for training. Thanks to newly proposed +Canonical Object Space, the method can not only exploit data across a variety +of datasets and camera setups to train a single 3D detector, but unlike +previous work it also works out of the box in previously unseen camera setups. +All this is crucial for practical applications, where the data and cameras are +extremely heterogeneous. + The method is evaluated on two standard autonomous driving datasets, where it +outperforms previous works, which, unlike our method, still rely on 2D human +annotations. + +
+
+
+
+
+ + ☆ DEFOM-Stereo: Depth Foundation Model Based Stereo Matching + + +
+ Stereo matching is a key technique for metric depth estimation in computer +vision and robotics. Real-world challenges like occlusion and non-texture +hinder accurate disparity estimation from binocular matching cues. Recently, +monocular relative depth estimation has shown remarkable generalization using +vision foundation models. Thus, to facilitate robust stereo matching with +monocular depth cues, we incorporate a robust monocular relative depth model +into the recurrent stereo-matching framework, building a new framework for +depth foundation model-based stereo-matching, DEFOM-Stereo. In the feature +extraction stage, we construct the combined context and matching feature +encoder by integrating features from conventional CNNs and DEFOM. In the update +stage, we use the depth predicted by DEFOM to initialize the recurrent +disparity and introduce a scale update module to refine the disparity at the +correct scale. DEFOM-Stereo is verified to have comparable performance on the +Scene Flow dataset with state-of-the-art (SOTA) methods and notably shows much +stronger zero-shot generalization. Moreover, DEFOM-Stereo achieves SOTA +performance on the KITTI 2012, KITTI 2015, Middlebury, and ETH3D benchmarks, +ranking 1st on many metrics. In the joint evaluation under the robust vision +challenge, our model simultaneously outperforms previous models on the +individual benchmarks. Both results demonstrate the outstanding capabilities of +the proposed model. + +
+
+ comment: Code: https://github.com/Insta360-Research-Team/DEFOM-Stereo +
+
+
+
+
+ + ☆ RE-POSE: Synergizing Reinforcement Learning-Based Partitioning and + Offloading for Edge Object Detection + + +
+ Object detection plays a crucial role in smart video analysis, with +applications ranging from autonomous driving and security to smart cities. +However, achieving real-time object detection on edge devices presents +significant challenges due to their limited computational resources and the +high demands of deep neural network (DNN)-based detection models, particularly +when processing high-resolution video. Conventional strategies, such as input +down-sampling and network up-scaling, often compromise detection accuracy for +faster performance or lead to higher inference latency. To address these +issues, this paper introduces RE-POSE, a Reinforcement Learning (RL)-Driven +Partitioning and Edge Offloading framework designed to optimize the +accuracy-latency trade-off in resource-constrained edge environments. Our +approach features an RL-Based Dynamic Clustering Algorithm (RL-DCA) that +partitions video frames into non-uniform blocks based on object distribution +and the computational characteristics of DNNs. Furthermore, a parallel edge +offloading scheme is implemented to distribute these blocks across multiple +edge servers for concurrent processing. Experimental evaluations show that +RE-POSE significantly enhances detection accuracy and reduces inference +latency, surpassing existing methods. + +
+
+
+
+
+ + ☆ Normal-NeRF: Ambiguity-Robust Normal Estimation for Highly Reflective + Scenes + + +
+ Neural Radiance Fields (NeRF) often struggle with reconstructing and +rendering highly reflective scenes. Recent advancements have developed various +reflection-aware appearance models to enhance NeRF's capability to render +specular reflections. However, the robust reconstruction of highly reflective +scenes is still hindered by the inherent shape ambiguity on specular surfaces. +Existing methods typically rely on additional geometry priors to regularize the +shape prediction, but this can lead to oversmoothed geometry in complex scenes. +Observing the critical role of surface normals in parameterizing reflections, +we introduce a transmittance-gradient-based normal estimation technique that +remains robust even under ambiguous shape conditions. Furthermore, we propose a +dual activated densities module that effectively bridges the gap between smooth +surface normals and sharp object boundaries. Combined with a reflection-aware +appearance model, our proposed method achieves robust reconstruction and +high-fidelity rendering of scenes featuring both highly specular reflections +and intricate geometric structures. Extensive experiments demonstrate that our +method outperforms existing state-of-the-art methods on various datasets. + +
+
+ comment: AAAI 2025, code available at https://github.com/sjj118/Normal-NeRF +
+
+
+
+
+ + ☆ On the Relation between Optical Aperture and Automotive Object Detection + + +
+ We explore the impact of aperture size and shape on automotive camera systems +for deep-learning-based tasks like traffic sign recognition and light state +detection. A method is proposed to simulate optical effects using the point +spread function (PSF), enhancing realism and reducing the domain gap between +synthetic and real-world images. Computer-generated scenes are refined with +this technique to model optical distortions and improve simulation accuracy. + +
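+ An illustrative PSF-based blur simulation: a synthetic image is convolved with
+a Gaussian point spread function whose width stands in for aperture-dependent
+defocus. The paper derives the PSF from the actual optics; the Gaussian here is
+only an assumed stand-in.
+
+import numpy as np
+from scipy.signal import fftconvolve
+
+def gaussian_psf(size=15, sigma=2.0):
+    ax = np.arange(size) - size // 2
+    xx, yy = np.meshgrid(ax, ax)
+    psf = np.exp(-(xx ** 2 + yy ** 2) / (2 * sigma ** 2))
+    return psf / psf.sum()                       # normalize to unit energy
+
+image = np.zeros((128, 128))
+image[48:80, 48:80] = 1.0                        # synthetic "traffic sign"
+blurred = fftconvolve(image, gaussian_psf(sigma=3.0), mode="same")
+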
+
+
+
+
+ + ☆ Double Visual Defense: Adversarial Pre-training and Instruction Tuning + for Improving Vision-Language Model Robustness + + +
+ This paper investigates the robustness of vision-language models against
+adversarial visual perturbations and introduces a novel "double visual
+defense" to enhance this robustness. Unlike previous approaches that resort to
+lightweight adversarial fine-tuning of a pre-trained CLIP model, we perform
+large-scale adversarial vision-language pre-training from scratch using
+web-scale data. We then strengthen the defense by incorporating adversarial
+visual instruction tuning. The resulting models from each stage, $\Delta$CLIP
+and $\Delta^2$LLaVA, show substantially enhanced zero-shot robustness and set a
+new state-of-the-art in adversarial defense for vision-language models. For
+example, the adversarial robustness of $\Delta$CLIP surpasses that of the
+previous best models on ImageNet-1k by ~20%. Similarly, compared to prior art,
+$\Delta^2$LLaVA brings a ~30% robustness improvement to the image captioning
+task and a ~20% robustness improvement to the visual question answering task.
+Furthermore, our models exhibit stronger zero-shot recognition capability,
+fewer hallucinations, and superior reasoning performance compared to baselines.
+Our project page is https://doublevisualdefense.github.io/.
+
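+ For reference, a generic L-infinity PGD attack loop in its standard form, the
+kind of inner maximization such robustness evaluations rely on; the epsilon,
+step size, and step count are arbitrary choices, and this is not the authors'
+training recipe.
+
+import torch
+import torch.nn.functional as F
+
+def pgd_attack(model, images, labels, eps=4/255, alpha=1/255, steps=10):
+    adv = images.clone().detach()
+    adv += torch.empty_like(adv).uniform_(-eps, eps)   # random start
+    for _ in range(steps):
+        adv.requires_grad_(True)
+        loss = F.cross_entropy(model(adv), labels)
+        grad = torch.autograd.grad(loss, adv)[0]
+        adv = adv.detach() + alpha * grad.sign()        # ascend the loss
+        adv = images + (adv - images).clamp(-eps, eps)  # project to the eps-ball
+        adv = adv.clamp(0, 1)
+    return adv.detach()
+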
+
+
+
+
+ + ☆ Scaling up self-supervised learning for improved surgical foundation + models + + +
+ Foundation models have revolutionized computer vision by achieving vastly +superior performance across diverse tasks through large-scale pretraining on +extensive datasets. However, their application in surgical computer vision has +been limited. This study addresses this gap by introducing SurgeNetXL, a novel +surgical foundation model that sets a new benchmark in surgical computer +vision. Trained on the largest reported surgical dataset to date, comprising +over 4.7 million video frames, SurgeNetXL achieves consistent top-tier +performance across six datasets spanning four surgical procedures and three +tasks, including semantic segmentation, phase recognition, and critical view of +safety (CVS) classification. Compared with the best-performing surgical +foundation models, SurgeNetXL shows mean improvements of 2.4, 9.0, and 12.6 +percent for semantic segmentation, phase recognition, and CVS classification, +respectively. Additionally, SurgeNetXL outperforms the best-performing +ImageNet-based variants by 14.4, 4.0, and 1.6 percent in the respective tasks. +In addition to advancing model performance, this study provides key insights +into scaling pretraining datasets, extending training durations, and optimizing +model architectures specifically for surgical computer vision. These findings +pave the way for improved generalizability and robustness in data-scarce +scenarios, offering a comprehensive framework for future research in this +domain. All models and a subset of the SurgeNetXL dataset, including over 2 +million video frames, are publicly available at: +https://github.com/TimJaspers0801/SurgeNet. + +
+
+
+
+
+ + ☆ CaPa: Carve-n-Paint Synthesis for Efficient 4K Textured Mesh Generation + + +
+ The synthesis of high-quality 3D assets from textual or visual inputs has +become a central objective in modern generative modeling. Despite the +proliferation of 3D generation algorithms, they frequently grapple with +challenges such as multi-view inconsistency, slow generation times, low +fidelity, and surface reconstruction problems. While some studies have +addressed some of these issues, a comprehensive solution remains elusive. In +this paper, we introduce \textbf{CaPa}, a carve-and-paint framework that +generates high-fidelity 3D assets efficiently. CaPa employs a two-stage +process, decoupling geometry generation from texture synthesis. Initially, a 3D +latent diffusion model generates geometry guided by multi-view inputs, ensuring +structural consistency across perspectives. Subsequently, leveraging a novel, +model-agnostic Spatially Decoupled Attention, the framework synthesizes +high-resolution textures (up to 4K) for a given geometry. Furthermore, we +propose a 3D-aware occlusion inpainting algorithm that fills untextured +regions, resulting in cohesive results across the entire model. This pipeline +generates high-quality 3D assets in less than 30 seconds, providing +ready-to-use outputs for commercial applications. Experimental results +demonstrate that CaPa excels in both texture fidelity and geometric stability, +establishing a new standard for practical, scalable 3D asset generation. + +
+
+ comment: project page: https://ncsoft.github.io/CaPa/ +
+
+
+
+
+ + ☆ AugRefer: Advancing 3D Visual Grounding via Cross-Modal Augmentation and + Spatial Relation-based Referring + + +
+ 3D visual grounding (3DVG), which aims to correlate a natural language
+description with the target object within a 3D scene, is a significant yet
+challenging task. Despite recent advancements in this domain, existing
+approaches commonly encounter a shortage: the limited amount and diversity of
+text-3D pairs available for training. Moreover, they fall short in effectively
+leveraging different contextual clues (e.g., rich spatial relations within the
+3D visual space) for grounding. To address these limitations, we propose
+AugRefer, a novel approach for advancing 3D visual grounding. AugRefer
+introduces cross-modal augmentation designed to extensively generate diverse
+text-3D pairs by placing objects into 3D scenes and creating accurate and
+semantically rich descriptions using foundation models. Notably, the resulting
+pairs can be utilized by any existing 3DVG method to enrich its training
+data. Additionally, AugRefer presents a language-spatial adaptive decoder that
+effectively adapts the potential referring objects based on the language
+description and various 3D spatial relations. Extensive experiments on three
+benchmark datasets clearly validate the effectiveness of AugRefer.
+
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ☆ Vision-Language Models Do Not Understand Negation + + +
+ Many practical vision-language applications require models that understand +negation, e.g., when using natural language to retrieve images which contain +certain objects but not others. Despite advancements in vision-language models +(VLMs) through large-scale training, their ability to comprehend negation +remains underexplored. This study addresses the question: how well do current +VLMs understand negation? We introduce NegBench, a new benchmark designed to +evaluate negation understanding across 18 task variations and 79k examples +spanning image, video, and medical datasets. The benchmark consists of two core +tasks designed to evaluate negation understanding in diverse multimodal +settings: Retrieval with Negation and Multiple Choice Questions with Negated +Captions. Our evaluation reveals that modern VLMs struggle significantly with +negation, often performing at chance level. To address these shortcomings, we +explore a data-centric approach wherein we finetune CLIP models on large-scale +synthetic datasets containing millions of negated captions. We show that this +approach can result in a 10% increase in recall on negated queries and a 40% +boost in accuracy on multiple-choice questions with negated captions. + +
+
+ comment: Project page: https://negbench.github.io +
+
+
+
+
+ + ☆ Dynamic Neural Style Transfer for Artistic Image Generation using VGG19 + + +
+ Throughout history, humans have created remarkable works of art, but
+artificial intelligence has only recently started to make strides in generating
+visually compelling art. Breakthroughs in the past few years have focused on
+using convolutional neural networks (CNNs) to separate and manipulate the
+content and style of images, applying texture synthesis techniques.
+Nevertheless, a number of current techniques continue to encounter obstacles,
+including lengthy processing times, restricted choices of style images, and the
+inability to modify the weight ratio of styles. To address these constraints,
+we propose a neural style transfer system that can apply various artistic
+styles to a desired image, allowing flexible adjustment of style weight ratios
+and reducing processing time. The system uses the VGG19 model for feature
+extraction, ensuring high-quality, flexible stylization without compromising
+content integrity.
+
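+ The standard machinery such a system builds on is the Gatys-style Gram-matrix
+loss over VGG19 features; the sketch below assumes a recent torchvision,
+ImageNet-normalized inputs with batch size 1, and a typical choice of style
+layers, none of which are taken from the paper.
+
+import torch
+import torch.nn.functional as F
+import torchvision
+
+vgg = torchvision.models.vgg19(
+    weights=torchvision.models.VGG19_Weights.DEFAULT).features.eval()
+for p in vgg.parameters():
+    p.requires_grad_(False)
+
+def gram(feat):
+    """feat: (1, C, H, W) feature map -> (C, C) normalized Gram matrix."""
+    _, c, h, w = feat.shape
+    f = feat.view(c, h * w)
+    return f @ f.t() / (c * h * w)
+
+def style_loss(x, style_img, layers=(1, 6, 11, 20, 29)):  # relu*_1 layers
+    loss, fx, fs = 0.0, x, style_img
+    for i, layer in enumerate(vgg):
+        fx, fs = layer(fx), layer(fs)
+        if i in layers:
+            loss = loss + F.mse_loss(gram(fx), gram(fs))
+    return loss
+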
+
+
+
+
+ + ☆ Towards Robust and Realistic Human Pose Estimation via WiFi Signals + + +
+ Robust WiFi-based human pose estimation is a challenging task that bridges +discrete and subtle WiFi signals to human skeletons. This paper revisits this +problem and reveals two critical yet overlooked issues: 1) cross-domain gap, +i.e., due to significant variations between source-target domain pose +distributions; and 2) structural fidelity gap, i.e., predicted skeletal poses +manifest distorted topology, usually with misplaced joints and disproportionate +bone lengths. This paper fills these gaps by reformulating the task into a +novel two-phase framework dubbed DT-Pose: Domain-consistent representation +learning and Topology-constrained Pose decoding. Concretely, we first propose a +temporal-consistent contrastive learning strategy with uniformity +regularization, coupled with self-supervised masking-reconstruction operations, +to enable robust learning of domain-consistent and motion-discriminative +WiFi-specific representations. Beyond this, we introduce a simple yet effective +pose decoder with task prompts, which integrates Graph Convolution Network +(GCN) and Transformer layers to constrain the topology structure of the +generated skeleton by exploring the adjacent-overarching relationships among +human joints. Extensive experiments conducted on various benchmark datasets +highlight the superior performance of our method in tackling these fundamental +challenges in both 2D/3D human pose estimation tasks. + +
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ PISCO: Self-Supervised k-Space Regularization for Improved Neural + Implicit k-Space Representations of Dynamic MRI + + +
+ Neural implicit k-space representations (NIK) have shown promising results +for dynamic magnetic resonance imaging (MRI) at high temporal resolutions. Yet, +reducing acquisition time, and thereby available training data, results in +severe performance drops due to overfitting. To address this, we introduce a +novel self-supervised k-space loss function $\mathcal{L}_\mathrm{PISCO}$, +applicable for regularization of NIK-based reconstructions. The proposed loss +function is based on the concept of parallel imaging-inspired self-consistency +(PISCO), enforcing a consistent global k-space neighborhood relationship +without requiring additional data. Quantitative and qualitative evaluations on +static and dynamic MR reconstructions show that integrating PISCO significantly +improves NIK representations. Particularly for high acceleration factors +(R$\geq$54), NIK with PISCO achieves superior spatio-temporal reconstruction +quality compared to state-of-the-art methods. Furthermore, an extensive +analysis of the loss assumptions and stability shows PISCO's potential as +versatile self-supervised k-space loss function for further applications and +architectures. Code is available at: +https://github.com/compai-lab/2025-pisco-spieker + +
+
+
+
+
+ + ☆ Joint Transmission and Deblurring: A Semantic Communication Approach + Using Events + + +
+ Deep learning-based joint source-channel coding (JSCC) is emerging as a +promising technology for effective image transmission. However, most existing +approaches focus on transmitting clear images, overlooking real-world +challenges such as motion blur caused by camera shaking or fast-moving objects. +Motion blur often degrades image quality, making transmission and +reconstruction more challenging. Event cameras, which asynchronously record +pixel intensity changes with extremely low latency, have shown great potential +for motion deblurring tasks. However, the efficient transmission of the +abundant data generated by event cameras remains a significant challenge. In +this work, we propose a novel JSCC framework for the joint transmission of +blurry images and events, aimed at achieving high-quality reconstructions under +limited channel bandwidth. This approach is designed as a deblurring +task-oriented JSCC system. Since RGB cameras and event cameras capture the same +scene through different modalities, their outputs contain both shared and +domain-specific information. To avoid repeatedly transmitting the shared +information, we extract and transmit their shared information and +domain-specific information, respectively. At the receiver, the received +signals are processed by a deblurring decoder to generate clear images. +Additionally, we introduce a multi-stage training strategy to train the +proposed model. Simulation results demonstrate that our method significantly +outperforms existing JSCC-based image transmission schemes, addressing motion +blur effectively. + +
+
+
+
+
+ + ☆ SVIA: A Street View Image Anonymization Framework for Self-Driving + Applications + + +
+ In recent years, there has been an increasing interest in image +anonymization, particularly focusing on the de-identification of faces and +individuals. However, for self-driving applications, merely de-identifying +faces and individuals might not provide sufficient privacy protection since +street views like vehicles and buildings can still disclose locations, +trajectories, and other sensitive information. Therefore, it remains crucial to +extend anonymization techniques to street view images to fully preserve the +privacy of users, pedestrians, and vehicles. In this paper, we propose a Street +View Image Anonymization (SVIA) framework for self-driving applications. The +SVIA framework consists of three integral components: a semantic segmenter to +segment an input image into functional regions, an inpainter to generate +alternatives to privacy-sensitive regions, and a harmonizer to seamlessly +stitch modified regions to guarantee visual coherence. Compared to existing +methods, SVIA achieves a much better trade-off between image generation quality +and privacy protection, as evidenced by experimental results for five common +metrics on two widely used public datasets. + +
+
+ comment: 8 pages, 6 figures, 3 tables. Accepted by IEEE ITSC 2024 +
+
+
+
+
+ + ☆ Image Segmentation with transformers: An Overview, Challenges and Future + + +
+ Image segmentation, a key task in computer vision, has traditionally relied
+on convolutional neural networks (CNNs), yet these models struggle to capture
+complex spatial dependencies, handle objects at varying scales, and exploit
+contextual information, and they require manually crafted architecture
+components. This paper explores the shortcomings of CNN-based models and the
+shift towards transformer architectures to overcome those limitations. This
+work reviews state-of-the-art transformer-based segmentation models, addressing
+segmentation-specific challenges and their solutions. The paper discusses
+current challenges in transformer-based segmentation and outlines promising
+future trends, such as lightweight architectures and enhanced data efficiency.
+This survey serves as a guide for understanding the impact of transformers in
+advancing segmentation capabilities and overcoming the limitations of
+traditional models.
+
+
+
+
+
+ + ☆ Identification of Traditional Medicinal Plant Leaves Using an effective + Deep Learning model and Self-Curated Dataset + + +
+ Medicinal plants have been a key component in producing traditional and
+modern medicines, especially in the field of Ayurveda, an ancient Indian
+medical system. Producing these medicines and collecting and extracting the
+right plant is a crucial step due to the visually similar nature of some
+plants. Separating these plants from non-medicinal ones requires human expert
+intervention. To solve the issue of accurate plant identification and reduce
+the need for a human expert in the collection process, employing computer
+vision methods would be efficient and beneficial. In this paper, we propose a
+model that addresses these issues. The proposed model is a custom
+convolutional neural network (CNN) architecture with 6 convolution layers,
+max-pooling layers, and dense layers. The model was tested on three different
+datasets named Indian Medicinal Leaves Image Dataset, MED117 Medicinal Plant
+Leaf Dataset, and the self-curated dataset by the authors. The proposed model
+achieved respective accuracies of 99.5%, 98.4%, and 99.7% using various
+optimizers including Adam, RMSprop, and SGD with momentum.
+
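+ An illustrative PyTorch version of a small "6 convolution layers plus
+max-pooling plus dense layers" classifier in the spirit of the one described;
+the filter counts, input size, and pooling schedule are assumptions, not the
+authors' exact architecture.
+
+import torch.nn as nn
+
+class LeafCNN(nn.Module):
+    def __init__(self, num_classes, in_size=128):
+        super().__init__()
+        chans, layers, c_in = [32, 32, 64, 64, 128, 128], [], 3
+        for i, c_out in enumerate(chans):
+            layers += [nn.Conv2d(c_in, c_out, 3, padding=1), nn.ReLU(inplace=True)]
+            if i % 2 == 1:                       # pool after every second conv
+                layers.append(nn.MaxPool2d(2))
+            c_in = c_out
+        self.features = nn.Sequential(*layers)
+        feat = in_size // 8                      # three 2x poolings
+        self.classifier = nn.Sequential(
+            nn.Flatten(),
+            nn.Linear(128 * feat * feat, 256),
+            nn.ReLU(inplace=True),
+            nn.Linear(256, num_classes),
+        )
+
+    def forward(self, x):
+        return self.classifier(self.features(x))
+
+model = LeafCNN(num_classes=100)   # set num_classes to the species count
+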
+
+
+
+
+ + ☆ Strategic Base Representation Learning via Feature Augmentations for + Few-Shot Class Incremental Learning + + +
+ Few-shot class incremental learning requires a model to learn new classes
+from a small number of training instances while retaining knowledge of
+previously learned classes. Existing frameworks typically freeze the parameters
+of the previously learned classes during the incorporation of new classes.
+However, this approach often results in suboptimal class separation of
+previously learned classes, leading to overlap between old and new classes.
+Consequently, performance on old classes degrades as new classes are added. To
+address these challenges, we propose a novel feature augmentation driven
+contrastive learning framework designed to enhance the separation of previously
+learned classes to accommodate new classes. Our approach involves augmenting
+feature vectors and assigning proxy labels to these vectors. This strategy
+expands the feature space, ensuring seamless integration of new classes within
+the expanded space. Additionally, we employ a self-supervised contrastive loss
+to improve the separation between previous classes. We validate our framework
+through experiments on three FSCIL benchmark datasets: CIFAR100, miniImageNet,
+and CUB200. The results demonstrate that our Feature Augmentation driven
+Contrastive Learning framework significantly outperforms other approaches,
+achieving state-of-the-art performance.
+
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ☆ YETI (YET to Intervene) Proactive Interventions by Multimodal AI Agents + in Augmented Reality Tasks + + +
+ Multimodal AI Agents are AI models that have the capability of interactively +and cooperatively assisting human users to solve day-to-day tasks. Augmented +Reality (AR) head worn devices can uniquely improve the user experience of +solving procedural day-to-day tasks by providing egocentric multimodal (audio +and video) observational capabilities to AI Agents. Such AR capabilities can +help AI Agents see and listen to actions that users take which can relate to +multimodal capabilities of human users. Existing AI Agents, either Large +Language Models (LLMs) or Multimodal Vision-Language Models (VLMs) are reactive +in nature, which means that models cannot take an action without reading or +listening to the human user's prompts. Proactivity of AI Agents on the other +hand can help the human user detect and correct any mistakes in agent observed +tasks, encourage users when they do tasks correctly or simply engage in +conversation with the user - akin to a human teaching or assisting a user. Our +proposed YET to Intervene (YETI) multimodal agent focuses on the research +question of identifying circumstances that may require the agent to intervene +proactively. This allows the agent to understand when it can intervene in a +conversation with human users that can help the user correct mistakes on tasks, +like cooking, using AR. Our YETI Agent learns scene understanding signals based +on interpretable notions of Structural Similarity (SSIM) on consecutive video +frames. We also define the alignment signal which the AI Agent can learn to +identify if the video frames corresponding to the user's actions on the task +are consistent with expected actions. These signals are used by our AI Agent to +determine when it should proactively intervene. We compare our results on the +instances of proactive intervention in the HoloAssist multimodal benchmark for +an expert agent guiding a user to complete procedural tasks. + +
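+ A minimal per-frame SSIM signal over consecutive grayscale video frames using
+scikit-image, in the spirit of the scene-understanding cue described; the
+threshold and the synthetic frames are placeholders.
+
+import numpy as np
+from skimage.metrics import structural_similarity as ssim
+
+def ssim_signal(frames):
+    """frames: sequence of grayscale frames (H, W) with values in [0, 1]."""
+    return np.array([
+        ssim(frames[i], frames[i + 1], data_range=1.0)
+        for i in range(len(frames) - 1)
+    ])
+
+frames = [np.random.rand(120, 160) for _ in range(10)]
+scene_change = ssim_signal(frames) < 0.6   # low SSIM suggests a visual change
+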
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Making Your Dreams A Reality: Decoding the Dreams into a Coherent Video + Story from fMRI Signals + + +
+ This paper studies the brave new idea for Multimedia community, and proposes +a novel framework to convert dreams into coherent video narratives using fMRI +data. Essentially, dreams have intrigued humanity for centuries, offering +glimpses into our subconscious minds. Recent advancements in brain imaging, +particularly functional magnetic resonance imaging (fMRI), have provided new +ways to explore the neural basis of dreaming. By combining subjective dream +experiences with objective neurophysiological data, we aim to understand the +visual aspects of dreams and create complete video narratives. Our process +involves three main steps: reconstructing visual perception, decoding dream +imagery, and integrating dream stories. Using innovative techniques in fMRI +analysis and language modeling, we seek to push the boundaries of dream +research and gain deeper insights into visual experiences during sleep. This +technical report introduces a novel approach to visually decoding dreams using +fMRI signals and weaving dream visuals into narratives using language models. +We gather a dataset of dreams along with descriptions to assess the +effectiveness of our framework. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ UVRM: A Scalable 3D Reconstruction Model from Unposed Videos + + +
+ Large Reconstruction Models (LRMs) have recently become a popular method for +creating 3D foundational models. Training 3D reconstruction models with 2D +visual data traditionally requires prior knowledge of camera poses for the +training samples, a process that is both time-consuming and prone to errors. +Consequently, 3D reconstruction training has been confined to either synthetic +3D datasets or small-scale datasets with annotated poses. In this study, we +investigate the feasibility of 3D reconstruction using unposed video data of +various objects. We introduce UVRM, a novel 3D reconstruction model capable of +being trained and evaluated on monocular videos without requiring any +information about the pose. UVRM uses a transformer network to implicitly +aggregate video frames into a pose-invariant latent feature space, which is +then decoded into a tri-plane 3D representation. To obviate the need for +ground-truth pose annotations during training, UVRM employs a combination of +the score distillation sampling (SDS) method and an analysis-by-synthesis +approach, progressively synthesizing pseudo novel-views using a pre-trained +diffusion model. We qualitatively and quantitatively evaluate UVRM's +performance on the G-Objaverse and CO3D datasets without relying on pose +information. Extensive experiments show that UVRM is capable of effectively and +efficiently reconstructing a wide range of 3D objects from unposed videos. + +
+
+
+
+
+ + ☆ SE-BSFV: Online Subspace Learning based Shadow Enhancement and + Background Suppression for ViSAR under Complex Background + + +
+ Video synthetic aperture radar (ViSAR) has attracted substantial attention in +the moving target detection (MTD) field due to its ability to continuously +monitor changes in the target area. In ViSAR, the moving targets' shadows will +not offset and defocus, which is widely used as a feature for MTD. However, the +shadows are difficult to distinguish from the low scattering region in the +background, which will cause more missing and false alarms. Therefore, it is +worth investigating how to enhance the distinction between the shadows and +background. In this study, we proposed the Shadow Enhancement and Background +Suppression for ViSAR (SE-BSFV) algorithm. The SE-BSFV algorithm is based on +the low-rank representation (LRR) theory and adopts online subspace learning +technique to enhance shadows and suppress background for ViSAR images. Firstly, +we use a registration algorithm to register the ViSAR images and utilize +Gaussian mixture distribution (GMD) to model the ViSAR data. Secondly, the +knowledge learned from the previous frames is leveraged to estimate the GMD +parameters of the current frame, and the Expectation-maximization (EM) +algorithm is used to estimate the subspace parameters. Then, the foreground +matrix of the current frame can be obtained. Finally, the alternating direction +method of multipliers (ADMM) is used to eliminate strong scattering objects in +the foreground matrix to obtain the final results. The experimental results +indicate that the SE-BSFV algorithm significantly enhances the shadows' +saliency and greatly improves the detection performance while ensuring +efficiency compared with several other advanced pre-processing algorithms. + +
+
+
+
+
+ + ☆ Prompt-CAM: A Simpler Interpretable Transformer for Fine-Grained + Analysis + + +
+ We present a simple usage of pre-trained Vision Transformers (ViTs) for +fine-grained analysis, aiming to identify and localize the traits that +distinguish visually similar categories, such as different bird species or dog +breeds. Pre-trained ViTs such as DINO have shown remarkable capabilities to +extract localized, informative features. However, using saliency maps like +Grad-CAM can hardly point out the traits: they often locate the whole object by +a blurred, coarse heatmap, not traits. We propose a novel approach Prompt Class +Attention Map (Prompt-CAM) to the rescue. Prompt-CAM learns class-specific +prompts to a pre-trained ViT and uses the corresponding outputs for +classification. To classify an image correctly, the true-class prompt must +attend to the unique image patches not seen in other classes' images, i.e., +traits. As such, the true class's multi-head attention maps reveal traits and +their locations. Implementation-wise, Prompt-CAM is almost a free lunch by +simply modifying the prediction head of Visual Prompt Tuning (VPT). This makes +Prompt-CAM fairly easy to train and apply, sharply contrasting other +interpretable methods that design specific models and training processes. It is +even simpler than the recently published INterpretable TRansformer (INTR), +whose encoder-decoder architecture prevents it from leveraging pre-trained +ViTs. Extensive empirical studies on a dozen datasets from various domains +(e.g., birds, fishes, insects, fungi, flowers, food, and cars) validate +Prompt-CAM superior interpretation capability. + +
+
+
+
+
+ + ☆ Soft Knowledge Distillation with Multi-Dimensional Cross-Net Attention + for Image Restoration Models Compression + + +
+ Transformer-based encoder-decoder models have achieved remarkable success in
+image-to-image transfer tasks, particularly in image restoration. However,
+their high computational complexity, manifested in elevated FLOPs and parameter
+counts, limits their application in real-world scenarios. Existing knowledge
+distillation methods in image restoration typically employ lightweight student
+models that directly mimic the intermediate features and reconstruction results
+of the teacher, overlooking the implicit attention relationships between them.
+To address this, we propose a Soft Knowledge Distillation (SKD) strategy that
+incorporates a Multi-dimensional Cross-net Attention (MCA) mechanism for
+compressing image restoration models. This mechanism facilitates interaction
+between the student and teacher across both channel and spatial dimensions,
+enabling the student to implicitly learn the attention matrices. Additionally,
+we employ a Gaussian kernel function to measure the distance between student
+and teacher features in kernel space, ensuring stable and efficient feature
+learning. To further enhance the quality of reconstructed images, we replace
+the commonly used L1 or KL divergence loss with a contrastive learning loss at
+the image level. Experiments on three tasks (image deraining, deblurring, and
+denoising) demonstrate that our SKD strategy significantly reduces
+computational complexity while maintaining strong image restoration
+capabilities.
+
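+ One plausible reading of the kernel-space distillation term is a Gaussian
+(RBF) kernel distance between student and teacher feature maps; the bandwidth,
+flattening, and mean reduction below are assumptions for illustration only.
+
+import torch
+
+def gaussian_kernel_distance(f_student, f_teacher, sigma=1.0):
+    """f_*: (B, C, H, W) feature maps. Returns a scalar loss."""
+    diff = (f_student - f_teacher).flatten(1)           # (B, C*H*W)
+    sq_dist = (diff ** 2).sum(dim=1)
+    # 1 - k(x, y) with an RBF kernel; zero when the features match exactly.
+    return (1.0 - torch.exp(-sq_dist / (2 * sigma ** 2))).mean()
+
+loss = gaussian_kernel_distance(torch.randn(4, 64, 32, 32),
+                                torch.randn(4, 64, 32, 32))
+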
+
+ comment: Accepted by ICASSP2025 +
+
+
+
+
+ + ☆ Shape-Based Single Object Classification Using Ensemble Method + Classifiers + + +
+ Nowadays, more and more images are available. Annotation and retrieval of the
+images pose classification problems, where each class is defined as the group
+of database images labelled with a common semantic label. Various systems have
+been proposed for content-based retrieval, as well as for image classification
+and indexing. In this paper, a hierarchical classification framework has been
+proposed for bridging the semantic gap effectively and achieving multi-category
+image classification. A well-known pre-processing and post-processing method
+was used and applied to three problems: image segmentation, object
+identification, and image classification. The method was applied to classify
+single object images from Amazon and Google datasets. The classification was
+tested with four different classifiers: BayesNetwork (BN), Random Forest (RF),
+Bagging, and Vote. The estimated classification accuracies ranged from 20% to
+99% (using 10-fold cross validation). The Bagging classifier presented the best
+performance, followed by the Random Forest classifier.
+
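+ A reproducible scikit-learn comparison of the ensemble classifiers mentioned
+under 10-fold cross-validation; the digits dataset stands in for the extracted
+shape features, and GaussianNB is used as a simple stand-in for the Bayes
+network learner, so the numbers are not comparable to the paper's.
+
+from sklearn.datasets import load_digits
+from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
+                              VotingClassifier)
+from sklearn.model_selection import cross_val_score
+from sklearn.naive_bayes import GaussianNB
+
+X, y = load_digits(return_X_y=True)   # placeholder for shape features
+models = {
+    "NaiveBayes": GaussianNB(),
+    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=0),
+    "Bagging": BaggingClassifier(n_estimators=50, random_state=0),
+    "Vote": VotingClassifier([("nb", GaussianNB()),
+                              ("rf", RandomForestClassifier(random_state=0))],
+                             voting="soft"),
+}
+for name, clf in models.items():
+    scores = cross_val_score(clf, X, y, cv=10)
+    print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")
+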
+
+
+
+
+ + ☆ Domain-conditioned and Temporal-guided Diffusion Modeling for + Accelerated Dynamic MRI Reconstruction + + +
+ Purpose: To propose a domain-conditioned and temporal-guided diffusion +modeling method, termed dynamic Diffusion Modeling (dDiMo), for accelerated +dynamic MRI reconstruction, enabling diffusion process to characterize +spatiotemporal information for time-resolved multi-coil Cartesian and +non-Cartesian data. Methods: The dDiMo framework integrates temporal +information from time-resolved dimensions, allowing for the concurrent capture +of intra-frame spatial features and inter-frame temporal dynamics in diffusion +modeling. It employs additional spatiotemporal ($x$-$t$) and self-consistent +frequency-temporal ($k$-$t$) priors to guide the diffusion process. This +approach ensures precise temporal alignment and enhances the recovery of fine +image details. To facilitate a smooth diffusion process, the nonlinear +conjugate gradient algorithm is utilized during the reverse diffusion steps. +The proposed model was tested on two types of MRI data: Cartesian-acquired +multi-coil cardiac MRI and Golden-Angle-Radial-acquired multi-coil +free-breathing lung MRI, across various undersampling rates. Results: dDiMo +achieved high-quality reconstructions at various acceleration factors, +demonstrating improved temporal alignment and structural recovery compared to +other competitive reconstruction methods, both qualitatively and +quantitatively. This proposed diffusion framework exhibited robust performance +in handling both Cartesian and non-Cartesian acquisitions, effectively +reconstructing dynamic datasets in cardiac and lung MRI under different imaging +conditions. Conclusion: This study introduces a novel diffusion modeling method +for dynamic MRI reconstruction. + +
+
+ comment: 21 pages, 15 figures, 2 tables +
+
+
+
+
+ + ☆ Finding the Trigger: Causal Abductive Reasoning on Video Events + + +
+ This paper introduces a new problem, Causal Abductive Reasoning on Video +Events (CARVE), which involves identifying causal relationships between events +in a video and generating hypotheses about causal chains that account for the +occurrence of a target event. To facilitate research in this direction, we +create two new benchmark datasets with both synthetic and realistic videos, +accompanied by trigger-target labels generated through a novel counterfactual +synthesis approach. To explore the challenge of solving CARVE, we present a +Causal Event Relation Network (CERN) that examines the relationships between +video events in temporal and semantic spaces to efficiently determine the +root-cause trigger events. Through extensive experiments, we demonstrate the +critical roles of event relational representation learning and interaction +modeling in solving video causal reasoning challenges. The introduction of the +CARVE task, along with the accompanying datasets and the CERN framework, will +advance future research on video causal reasoning and significantly facilitate +various applications, including video surveillance, root-cause analysis and +movie content management. + +
+
+
+
+
+ + ☆ Creating Virtual Environments with 3D Gaussian Splatting: A Comparative + Study + + +
+ 3D Gaussian Splatting (3DGS) has recently emerged as an innovative and
+efficient 3D representation technique. While its potential for extended reality
+(XR) applications is frequently highlighted, its practical effectiveness
+remains underexplored. In this work, we examine three distinct 3DGS-based
+approaches for virtual environment (VE) creation, leveraging their unique
+strengths for efficient and visually compelling scene representation. By
+conducting a comparative study, we evaluate the feasibility of 3DGS in creating
+immersive VEs, identify its limitations in XR applications, and discuss future
+research and development opportunities.
+
+
+ comment: IEEE VR 2025 Posters +
+
+
+
+
+ + ☆ Efficient Few-Shot Medical Image Analysis via Hierarchical Contrastive + Vision-Language Learning + + +
+ Few-shot learning in medical image classification presents a significant +challenge due to the limited availability of annotated data and the complex +nature of medical imagery. In this work, we propose Adaptive Vision-Language +Fine-tuning with Hierarchical Contrastive Alignment (HiCA), a novel framework +that leverages the capabilities of Large Vision-Language Models (LVLMs) for +medical image analysis. HiCA introduces a two-stage fine-tuning strategy, +combining domain-specific pretraining and hierarchical contrastive learning to +align visual and textual representations at multiple levels. We evaluate our +approach on two benchmark datasets, Chest X-ray and Breast Ultrasound, +achieving state-of-the-art performance in both few-shot and zero-shot settings. +Further analyses demonstrate the robustness, generalizability, and +interpretability of our method, with substantial improvements in performance +compared to existing baselines. Our work highlights the potential of +hierarchical contrastive strategies in adapting LVLMs to the unique challenges +of medical imaging tasks. + +
+
+
+
+
+ + ☆ SoccerSynth-Detection: A Synthetic Dataset for Soccer Player Detection + + +
+ In soccer video analysis, player detection is essential for identifying key +events and reconstructing tactical positions. The presence of numerous players +and frequent occlusions, combined with copyright restrictions, severely +restricts the availability of datasets, leaving limited options such as +SoccerNet-Tracking and SportsMOT. These datasets suffer from a lack of +diversity, which hinders algorithms from adapting effectively to varied soccer +video contexts. To address these challenges, we developed +SoccerSynth-Detection, the first synthetic dataset designed for the detection +of synthetic soccer players. It includes a broad range of random lighting and +textures, as well as simulated camera motion blur. We validated its efficacy +using the object detection model (Yolov8n) against real-world datasets +(SoccerNet-Tracking and SportsMoT). In transfer tests, it matched the +performance of real datasets and significantly outperformed them in images with +motion blur; in pre-training tests, it demonstrated its efficacy as a +pre-training dataset, significantly enhancing the algorithm's overall +performance. Our work demonstrates the potential of synthetic datasets to +replace real datasets for algorithm training in the field of soccer video +analysis. + +
+
+
+
+
+ + ☆ Text-guided Synthetic Geometric Augmentation for Zero-shot 3D + Understanding CVPR + + +
+ Zero-shot recognition models require extensive training data for
+generalization. However, in zero-shot 3D classification, collecting 3D data and
+captions is costly and labor-intensive, posing a significant barrier compared
+to 2D vision. Recent advances in generative models have achieved unprecedented
+realism in synthetic data production, and recent research shows the potential
+for using generated data as training data. This naturally raises the question:
+can synthetic 3D data generated by generative models be used to expand limited
+3D datasets? In response, we present a synthetic 3D dataset expansion method,
+Text-guided Geometric Augmentation (TeGA). TeGA is tailored for
+language-image-3D pretraining, which achieves SoTA in zero-shot 3D
+classification, and uses a generative text-to-3D model to enhance and extend
+limited 3D datasets. Specifically, we automatically generate text-guided
+synthetic 3D data and introduce a consistency filtering strategy to discard
+noisy samples where semantics and geometric shapes do not match with text. In
+the experiment to double the original dataset size using TeGA, our approach
+demonstrates improvements over the baselines, achieving zero-shot performance
+gains of 3.0% on Objaverse-LVIS, 4.6% on ScanObjectNN, and 8.7% on ModelNet40.
+These results demonstrate that TeGA effectively bridges the 3D data gap,
+enabling robust zero-shot 3D classification even with limited real training
+data and paving the way for zero-shot 3D vision applications.
+
+
+ comment: 14 pages, 8 figures, this paper is submitted to CVPR +
+
+
+
+
+ + ☆ Bias for Action: Video Implicit Neural Representations with Bias + Modulation + + +
+ We propose a new continuous video modeling framework based on implicit neural +representations (INRs) called ActINR. At the core of our approach is the +observation that INRs can be considered as a learnable dictionary, with the +shapes of the basis functions governed by the weights of the INR, and their +locations governed by the biases. Given compact non-linear activation +functions, we hypothesize that an INR's biases are suitable to capture motion +across images, and facilitate compact representations for video sequences. +Using these observations, we design ActINR to share INR weights across frames +of a video sequence, while using unique biases for each frame. We further model +the biases as the output of a separate INR conditioned on time index to promote +smoothness. By training the video INR and this bias INR together, we +demonstrate unique capabilities, including $10\times$ video slow motion, +$4\times$ spatial super resolution along with $2\times$ slow motion, denoising, +and video inpainting. ActINR performs remarkably well across numerous video +processing tasks (often achieving more than 6dB improvement), setting a new +standard for continuous modeling of videos. + +
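+ A toy version of the "shared weights, per-frame biases" idea: one small
+SIREN-style MLP whose weights are shared across all frames while each frame
+owns its own additive bias vectors. The layer sizes, omega, and the use of
+plain per-frame bias tables (rather than a separate bias INR over time) are
+simplifying assumptions.
+
+import torch
+import torch.nn as nn
+
+class BiasModulatedINR(nn.Module):
+    def __init__(self, num_frames, hidden=64, layers=3, omega=30.0):
+        super().__init__()
+        dims = [2] + [hidden] * layers
+        self.linears = nn.ModuleList(
+            nn.Linear(dims[i], dims[i + 1], bias=False) for i in range(layers))
+        # One learnable bias vector per frame and per layer.
+        self.biases = nn.ParameterList(
+            nn.Parameter(torch.zeros(num_frames, hidden)) for _ in range(layers))
+        self.head = nn.Linear(hidden, 3)         # RGB output
+        self.omega = omega
+
+    def forward(self, coords, frame_idx):
+        """coords: (N, 2) pixel coordinates in [-1, 1]; frame_idx: int."""
+        h = coords
+        for lin, b in zip(self.linears, self.biases):
+            h = torch.sin(self.omega * (lin(h) + b[frame_idx]))
+        return self.head(h)
+
+model = BiasModulatedINR(num_frames=8)
+rgb = model(torch.rand(1024, 2) * 2 - 1, frame_idx=3)
+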
+
+
+
+
+ + ☆ Knowledge Distillation for Image Restoration : Simultaneous Learning + from Degraded and Clean Images + + +
+ Model compression through knowledge distillation has seen extensive +application in classification and segmentation tasks. However, its potential in +image-to-image translation, particularly in image restoration, remains +underexplored. To address this gap, we propose a Simultaneous Learning +Knowledge Distillation (SLKD) framework tailored for model compression in image +restoration tasks. SLKD employs a dual-teacher, single-student architecture +with two distinct learning strategies: Degradation Removal Learning (DRL) and +Image Reconstruction Learning (IRL), simultaneously. In DRL, the student +encoder learns from Teacher A to focus on removing degradation factors, guided +by a novel BRISQUE extractor. In IRL, the student decoder learns from Teacher B +to reconstruct clean images, with the assistance of a proposed PIQE extractor. +These strategies enable the student to learn from degraded and clean images +simultaneously, ensuring high-quality compression of image restoration models. +Experimental results across five datasets and three tasks demonstrate that SLKD +achieves substantial reductions in FLOPs and parameters, exceeding 80\%, while +maintaining strong image restoration performance. + +
+
+ comment: Accepted by ICASSP2025 +
+
+
+
+
+ + ☆ Are Open-Vocabulary Models Ready for Detection of MEP Elements on + Construction Sites + + +
+ The construction industry has long explored robotics and computer vision, yet +their deployment on construction sites remains very limited. These technologies +have the potential to revolutionize traditional workflows by enhancing +accuracy, efficiency, and safety in construction management. Ground robots +equipped with advanced vision systems could automate tasks such as monitoring +mechanical, electrical, and plumbing (MEP) systems. The present research +evaluates the applicability of open-vocabulary vision-language models compared +to fine-tuned, lightweight, closed-set object detectors for detecting MEP +components using a mobile ground robotic platform. A dataset collected with +cameras mounted on a ground robot was manually annotated and analyzed to +compare model performance. The results demonstrate that, despite the +versatility of vision-language models, fine-tuned lightweight models still +largely outperform them in specialized environments and for domain-specific +tasks. + +
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ☆ OpticFusion: Multi-Modal Neural Implicit 3D Reconstruction of + Microstructures by Fusing White Light Interferometry and Optical Microscopy 3DV 2025 + + +
+ White Light Interferometry (WLI) is a precise optical tool for measuring the +3D topography of microstructures. However, conventional WLI cannot capture the +natural color of a sample's surface, which is essential for many microscale +research applications that require both 3D geometry and color information. +Previous methods have attempted to overcome this limitation by modifying WLI +hardware and analysis software, but these solutions are often costly. In this +work, we address this challenge from a computer vision multi-modal +reconstruction perspective for the first time. We introduce OpticFusion, a +novel approach that uses an additional digital optical microscope (OM) to +achieve 3D reconstruction with natural color textures using multi-view WLI and +OM images. Our method employs a two-step data association process to obtain the +poses of WLI and OM data. By leveraging the neural implicit representation, we +fuse multi-modal data and apply color decomposition technology to extract the +sample's natural color. Tested on our multi-modal dataset of various microscale +samples, OpticFusion achieves detailed 3D reconstructions with color textures. +Our method provides an effective tool for practical applications across +numerous microscale research fields. The source code and our real-world dataset +are available at https://github.com/zju3dv/OpticFusion. + +
+
+ comment: 3DV 2025 +
+
+
+
+
+ + ☆ Leveraging Scale-aware Representations for improved + Concept-Representation Alignment in ViTs + + +
+ Vision Transformers (ViTs) are increasingly being adopted in various +sensitive vision applications - like medical diagnosis, facial recognition, +etc. To improve the interpretability of such models, many approaches attempt to +forward-align them with carefully annotated abstract, human-understandable +semantic entities - concepts. Concepts provide global rationales to the model +predictions and can be quickly understood/intervened on by domain experts. Most +current research focuses on designing model-agnostic, plug-and-play generic +concept-based explainability modules that do not incorporate the inner workings +of foundation models (e.g., inductive biases, scale invariance, etc.) during +training. To alleviate this issue for ViTs, in this paper, we propose a novel +Concept Representation Alignment Module (CRAM) which learns both scale and +position-aware representations from multi-scale feature pyramids and patch +representations respectively. CRAM further aligns these representations with +concept annotations through an attention matrix. The proposed CRAM module +improves the predictive performance of ViT architectures and also provides +accurate and robust concept explanations as demonstrated on five datasets - +including three widely used benchmarks (CUB, Pascal APY, Concept-MNIST) and 2 +real-world datasets (AWA2, KITS). + +
+
+
+
+
+ + ☆ Adaptive Law-Based Transformation (ALT): A Lightweight Feature + Representation for Time Series Classification + + +
+ Time series classification (TSC) is fundamental in numerous domains, +including finance, healthcare, and environmental monitoring. However, +traditional TSC methods often struggle with the inherent complexity and +variability of time series data. Building on our previous work with the linear +law-based transformation (LLT) - which improved classification accuracy by +transforming the feature space based on key data patterns - we introduce +adaptive law-based transformation (ALT). ALT enhances LLT by incorporating +variable-length shifted time windows, enabling it to capture distinguishing +patterns of various lengths and thereby handle complex time series more +effectively. By mapping features into a linearly separable space, ALT provides +a fast, robust, and transparent solution that achieves state-of-the-art +performance with only a few hyperparameters. + +
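+ A minimal, illustrative sketch of the variable-length shifted windows that
+the transform above operates on (window lengths and stride are arbitrary
+choices for the example; this is not the authors' ALT code):
+
+    import numpy as np
+
+    def shifted_windows(series, lengths=(8, 16, 32), stride=4):
+        """Return a dict mapping window length -> array of shape (n_windows, length)."""
+        series = np.asarray(series, dtype=float)
+        out = {}
+        for length in lengths:
+            starts = range(0, len(series) - length + 1, stride)
+            out[length] = np.stack([series[s:s + length] for s in starts])
+        return out
+
+    windows = shifted_windows(np.sin(np.linspace(0, 10, 200)))
+    print({length: w.shape for length, w in windows.items()})
+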
+
+ comment: 8 pages, 1 figure, 5 tables +
+
+
+
+
+ + ☆ Surgical Visual Understanding (SurgVU) Dataset + + +
+ Owing to recent advances in machine learning and the ability to harvest large
+amounts of data during robotic-assisted surgeries, surgical data science is
+ripe for foundational work. We present a large dataset of surgical videos and
+their accompanying labels for this purpose. We describe how the data was
+collected and some of its unique attributes. Multiple example problems are
+outlined. Although the dataset was curated for a particular set of scientific
+challenges (in an accompanying paper), it is general enough to be used for a
+broad range of machine learning questions. Our hope is that this dataset
+exposes the larger machine learning community to the challenging problems
+within surgical data science, and becomes a touchstone for future research. The
+videos are available at
+https://storage.googleapis.com/isi-surgvu/surgvu24_videos_only.zip, the labels
+at https://storage.googleapis.com/isi-surgvu/surgvu24_labels_updated_v2.zip,
+and a validation set for the tool detection problem at
+https://storage.googleapis.com/isi-surgvu/cat1_test_set_public.zip.
+
+
+
+
+
+ + ♻ ☆ FutureDepth: Learning to Predict the Future Improves Video Depth + Estimation ECCV 2024 + + +
+ In this paper, we propose a novel video depth estimation approach,
+FutureDepth, which enables the model to implicitly leverage multi-frame and
+motion cues to improve depth estimation by making it learn to predict the
+future during training. More specifically, we propose a future prediction
+network, F-Net, which takes the features of multiple consecutive frames and is
+trained to predict multi-frame features one time step ahead iteratively. In
+this way, F-Net learns the underlying motion and correspondence information,
+and we incorporate its features into the depth decoding process. Additionally,
+to enrich the learning of multiframe correspondence cues, we further leverage a
+reconstruction network, R-Net, which is trained via adaptively masked
+auto-encoding of multiframe feature volumes. At inference time, both F-Net and
+R-Net are used to produce queries to work with the depth decoder, as well as a
+final refinement network. Through extensive experiments on several benchmarks,
+i.e., NYUDv2, KITTI, DDAD, and Sintel, which cover indoor, driving, and
+open-domain scenarios, we show that FutureDepth significantly improves upon
+baseline models, outperforms existing video depth estimation methods, and sets
+new state-of-the-art (SOTA) accuracy. Furthermore, FutureDepth is more
+efficient than existing SOTA video depth estimation models and has latency
+similar to that of monocular models.
+
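+ The following is a minimal, illustrative sketch of the "predict features one
+step ahead" idea described above; the module, tensor shapes, and loss are
+assumptions for demonstration and are not the paper's F-Net.
+
+    import torch
+    import torch.nn as nn
+
+    class TinyFuturePredictor(nn.Module):
+        """Predicts the next frame's feature map from a stack of past features."""
+        def __init__(self, channels=64, frames=3):
+            super().__init__()
+            self.net = nn.Conv2d(channels * frames, channels, kernel_size=3, padding=1)
+
+        def forward(self, feats):                     # feats: (B, T, C, H, W)
+            b, t, c, h, w = feats.shape
+            return self.net(feats.reshape(b, t * c, h, w))
+
+    feats = torch.randn(2, 3, 64, 32, 32)   # features of 3 consecutive frames
+    target = torch.randn(2, 64, 32, 32)     # features one time step ahead
+    loss = nn.functional.l1_loss(TinyFuturePredictor()(feats), target)
+    loss.backward()          # the learned predictive features would feed a depth decoder
+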
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ MAMo: Leveraging Memory and Attention for Monocular Video Depth + Estimation ICCV 2023 + + +
+ We propose MAMo, a novel memory and attention framework for monocular video
+depth estimation. MAMo can augment and improve any single-image depth
+estimation network into a video depth estimation model, enabling it to take
+advantage of the temporal information to predict more accurate depth. In MAMo,
+we augment the model with memory, which aids the depth prediction as the model
+streams through the video. Specifically, the memory stores learned visual and
+displacement tokens of the previous time instances. This allows the depth
+network to cross-reference relevant features from the past when predicting
+depth on the current frame. We introduce a novel scheme to continuously update
+the memory, optimizing it to keep tokens that correspond with both the past and
+the present visual information. We adopt an attention-based approach to process
+memory features, where we first learn the spatio-temporal relation among the
+resultant visual and displacement memory tokens using a self-attention module.
+Further, the output features of self-attention are aggregated with the current
+visual features through cross-attention. The cross-attended features are
+finally given to a decoder to predict depth on the current frame. Through
+extensive experiments on several benchmarks, including KITTI, NYU-Depth V2, and
+DDAD, we show that MAMo consistently improves monocular depth estimation
+networks and sets new state-of-the-art (SOTA) accuracy. Notably, our MAMo video
+depth estimation provides higher accuracy with lower latency compared to SOTA
+cost-volume-based video depth models.
+
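+ Below is a minimal, illustrative sketch of the memory-plus-attention pattern
+described above (self-attention over stored memory tokens, then cross-attention
+from current-frame features); the dimensions and module choices are assumptions
+and this is not MAMo itself.
+
+    import torch
+    import torch.nn as nn
+
+    dim = 64
+    self_attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+    cross_attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+
+    memory = torch.randn(1, 32, dim)     # tokens stored from previous frames
+    current = torch.randn(1, 128, dim)   # tokens of the current frame
+
+    # Refine memory tokens (spatio-temporal relations), then let current-frame
+    # tokens attend to the refined memory; the result would go to a depth decoder.
+    memory_refined, _ = self_attn(memory, memory, memory)
+    fused, _ = cross_attn(current, memory_refined, memory_refined)
+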
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Vulnerability-Aware Spatio-Temporal Learning for Generalizable and + Interpretable Deepfake Video Detection + + +
+ Detecting deepfake videos is highly challenging due to the complex +intertwined spatial and temporal artifacts in forged sequences. Most recent +approaches rely on binary classifiers trained on both real and fake data. +However, such methods may struggle to focus on important artifacts, which can +hinder their generalization capability. Additionally, these models often lack +interpretability, making it difficult to understand how predictions are made. +To address these issues, we propose FakeSTormer, offering two key +contributions. First, we introduce a multi-task learning framework with +additional spatial and temporal branches that enable the model to focus on +subtle spatio-temporal artifacts. These branches also provide interpretability +by highlighting video regions that may contain artifacts. Second, we propose a +video-level data synthesis algorithm that generates pseudo-fake videos with +subtle artifacts, providing the model with high-quality samples and ground +truth data for our spatial and temporal branches. Extensive experiments on +several challenging benchmarks demonstrate the competitiveness of our approach +compared to recent state-of-the-art methods. The code is available at +https://github.com/10Ring/FakeSTormer. + +
+
+
+
+
+ + ♻ ☆ Super-class guided Transformer for Zero-Shot Attribute Classification + + +
+ Attribute classification is crucial for identifying specific characteristics
+within image regions. Vision-Language Models (VLMs) have been effective in
+zero-shot tasks by leveraging their general knowledge from large-scale
+datasets. Recent studies demonstrate that transformer-based models with
+class-wise queries can effectively address zero-shot multi-label
+classification. However, poor utilization of the relationship between seen and
+unseen attributes limits the model's generalizability. Additionally, attribute
+classification generally involves many attributes, making it difficult to
+maintain the model's scalability. To address these issues, we propose
+Super-class guided transFormer (SugaFormer), a novel framework that leverages
+super-classes to enhance scalability and generalizability for zero-shot
+attribute classification. SugaFormer employs Super-class Query Initialization
+(SQI) to reduce the number of queries, utilizing common semantic information
+from super-classes, and incorporates Multi-context Decoding (MD) to handle
+diverse visual cues. To strengthen generalizability, we introduce two knowledge
+transfer strategies that utilize VLMs. During training, Super-class guided
+Consistency Regularization (SCR) aligns the model's features with VLMs using
+super-class guided prompts, and during inference, Zero-shot Retrieval-based
+Score Enhancement (ZRSE) refines predictions for unseen attributes. Extensive
+experiments demonstrate that SugaFormer achieves state-of-the-art performance
+across three widely-used attribute classification benchmarks under zero-shot
+and cross-dataset transfer settings. Our code is available at
+https://github.com/mlvlab/SugaFormer.
+
+
+ comment: AAAI25 +
+
+
+
+
+ + ♻ ☆ VIS-MAE: An Efficient Self-supervised Learning Approach on Medical Image + Segmentation and Classification + + +
+ Artificial Intelligence (AI) has the potential to revolutionize diagnosis and
+segmentation in medical imaging. However, development and clinical
+implementation face multiple challenges including limited data availability,
+lack of generalizability, and the necessity to incorporate multi-modal data
+effectively. A foundation model, which is a large-scale pre-trained AI model,
+offers a versatile base that can be adapted to a variety of specific tasks and
+contexts. Here, we present VIsualization and Segmentation Masked AutoEncoder
+(VIS-MAE), novel model weights specifically designed for medical imaging.
+Specifically, VIS-MAE is trained on a dataset of 2.5 million unlabeled images
+from various modalities (CT, MR, PET, X-rays, and ultrasound), using
+self-supervised learning techniques. It is then adapted to classification and
+segmentation tasks using explicit labels. VIS-MAE outperforms several benchmark
+models in both in-domain and out-of-domain applications and has high label
+efficiency: it achieves performance similar to that of other pre-trained
+weights while using a reduced amount of labeled training data (50% or 80%).
+VIS-MAE represents a significant advancement in medical imaging AI, offering a
+generalizable and robust solution for improving segmentation and classification
+tasks while reducing the data annotation workload. The source code of this work
+is available at https://github.com/lzl199704/VIS-MAE.
+
+
+
+
+
+ + ♻ ☆ A Comparative Study on Multi-task Uncertainty Quantification in Semantic + Segmentation and Monocular Depth Estimation + + +
+ Deep neural networks excel in perception tasks such as semantic segmentation +and monocular depth estimation, making them indispensable in safety-critical +applications like autonomous driving and industrial inspection. However, they +often suffer from overconfidence and poor explainability, especially for +out-of-domain data. While uncertainty quantification has emerged as a promising +solution to these challenges, multi-task settings have yet to be explored. In +an effort to shed light on this, we evaluate Monte Carlo Dropout, Deep +Sub-Ensembles, and Deep Ensembles for joint semantic segmentation and monocular +depth estimation. Thereby, we reveal that Deep Ensembles stand out as the +preferred choice, particularly in out-of-domain scenarios, and show the +potential benefit of multi-task learning with regard to the uncertainty quality +in comparison to solving both tasks separately. Additionally, we highlight the +impact of employing different uncertainty thresholds to classify pixels as +certain or uncertain, with the median uncertainty emerging as a robust default. + +
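+ A minimal sketch of one of the compared techniques, Monte Carlo Dropout, for
+a toy segmentation head (illustrative only; the network, number of passes, and
+entropy-based uncertainty map are assumptions, not the paper's setup):
+
+    import torch
+    import torch.nn as nn
+
+    model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                          nn.Dropout2d(0.5), nn.Conv2d(16, 5, 1))   # 5 classes
+
+    def mc_dropout_predict(model, image, passes=10):
+        model.train()                      # keep dropout stochastic at test time
+        with torch.no_grad():
+            probs = torch.stack([model(image).softmax(dim=1)
+                                 for _ in range(passes)]).mean(0)
+        entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1)  # (B, H, W)
+        return probs, entropy
+
+    probs, uncertainty = mc_dropout_predict(model, torch.randn(1, 3, 64, 64))
+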
+
+ comment: This manuscript is an extended version of a previously published + conference paper and is currently in review for a journal +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Foundation Models in Medicine + + +
+ Foundation models (FMs) are large-scale deep learning models trained on
+massive datasets, often using self-supervised learning techniques. These models
+serve as a versatile base for a wide range of downstream tasks, including those
+in medicine and healthcare. FMs have demonstrated remarkable success across
+multiple healthcare domains. However, existing surveys in this field do not
+comprehensively cover all areas where FMs have made significant strides. In
+this survey, we present a comprehensive review of FMs in medicine, focusing on
+their evolution, learning strategies, flagship models, applications, and
+associated challenges. We examine how prominent FMs, such as the BERT and GPT
+families, are transforming various aspects of healthcare, including clinical
+large language models, medical image analysis, and omics research.
+Additionally, we provide a detailed taxonomy of FM-enabled healthcare
+applications, spanning clinical natural language processing, medical computer
+vision, graph learning, and other biology- and omics-related tasks. Despite the
+transformative potential of FMs, they also pose unique challenges. This survey
+delves into these challenges and highlights open research questions and lessons
+learned to guide researchers and practitioners. Our goal is to provide valuable
+insights into the capabilities of FMs in healthcare, facilitating responsible
+deployment and mitigating associated risks.
+
+
+ comment: Currently under review in IEEE REVIEWS IN BIOMEDICAL ENGINEERING +
+
+
+
+
+ + ♻ ☆ Improving Zero-Shot Object-Level Change Detection by Incorporating + Visual Correspondence + + +
+ Detecting object-level changes between two images across possibly different +views is a core task in many applications that involve visual inspection or +camera surveillance. Existing change-detection approaches suffer from three +major limitations: (1) lack of evaluation on image pairs that contain no +changes, leading to unreported false positive rates; (2) lack of +correspondences (i.e., localizing the regions before and after a change); and +(3) poor zero-shot generalization across different domains. To address these +issues, we introduce a novel method that leverages change correspondences (a) +during training to improve change detection accuracy, and (b) at test time, to +minimize false positives. That is, we harness the supervision labels of where +an object is added or removed to supervise change detectors, improving their +accuracy over previous work by a large margin. Our work is also the first to +predict correspondences between pairs of detected changes using estimated +homography and the Hungarian algorithm. Our model demonstrates superior +performance over existing methods, achieving state-of-the-art results in change +detection and change correspondence accuracy across both in-distribution and +zero-shot benchmarks. + +
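+ A minimal, illustrative sketch of correspondence prediction with the
+Hungarian algorithm as mentioned above: detected change boxes from the two
+images are matched by centre distance. The inputs are assumed to already be in
+a common frame (e.g., after a homography warp), which is not shown here.
+
+    import numpy as np
+    from scipy.optimize import linear_sum_assignment
+
+    def match_changes(centres_a, centres_b):
+        """centres_*: (N, 2) and (M, 2) arrays of change-box centres."""
+        cost = np.linalg.norm(centres_a[:, None, :] - centres_b[None, :, :], axis=-1)
+        rows, cols = linear_sum_assignment(cost)     # minimises total distance
+        return list(zip(rows.tolist(), cols.tolist()))
+
+    pairs = match_changes(np.array([[10., 20.], [50., 60.]]),
+                          np.array([[52., 58.], [11., 19.]]))
+    print(pairs)   # [(0, 1), (1, 0)]
+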
+
+
+
+
+ + ♻ ☆ MECD+: Unlocking Event-Level Causal Graph Discovery for Video Reasoning TPAMI + + +
+ Video causal reasoning aims to achieve a high-level understanding of videos +from a causal perspective. However, it exhibits limitations in its scope, +primarily executed in a question-answering paradigm and focusing on brief video +segments containing isolated events and basic causal relations, lacking +comprehensive and structured causality analysis for videos with multiple +interconnected events. To fill this gap, we introduce a new task and dataset, +Multi-Event Causal Discovery (MECD). It aims to uncover the causal relations +between events distributed chronologically across long videos. Given visual +segments and textual descriptions of events, MECD identifies the causal +associations between these events to derive a comprehensive and structured +event-level video causal graph explaining why and how the result event +occurred. To address the challenges of MECD, we devise a novel framework +inspired by the Granger Causality method, incorporating an efficient mask-based +event prediction model to perform an Event Granger Test. It estimates causality +by comparing the predicted result event when premise events are masked versus +unmasked. Furthermore, we integrate causal inference techniques such as +front-door adjustment and counterfactual inference to mitigate challenges in +MECD like causality confounding and illusory causality. Additionally, context +chain reasoning is introduced to conduct more robust and generalized reasoning. +Experiments validate the effectiveness of our framework in reasoning complete +causal relations, outperforming GPT-4o and VideoChat2 by 5.77% and 2.70%, +respectively. Further experiments demonstrate that causal relation graphs can +also contribute to downstream video understanding tasks such as video question +answering and video event prediction. + +
+
+ comment: IEEE TPAMI submission; a continuation of arXiv:2409.17647 (NeurIPS
+ 2024)
+
+
+
+
+
+ + ♻ ☆ VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction + + +
+ Recent Multimodal Large Language Models (MLLMs) have typically focused on
+integrating visual and textual modalities, with less emphasis placed on the
+role of speech in enhancing interaction. However, speech plays a crucial role
+in multimodal dialogue systems, and achieving high performance in both vision
+and speech tasks remains a significant challenge due to the fundamental
+modality differences. In this paper, we propose a carefully designed
+multi-stage training methodology that progressively trains the LLM to
+understand both visual and speech information, ultimately enabling fluent
+vision and speech interaction. Our approach not only preserves strong
+vision-language capacity, but also enables efficient speech-to-speech dialogue
+capabilities without separate ASR and TTS modules, significantly accelerating
+multimodal end-to-end response speed. By comparing our method against
+state-of-the-art counterparts across benchmarks for image, video, and speech
+tasks, we demonstrate that our model is equipped with both strong visual and
+speech capabilities, enabling near real-time vision and speech interaction.
+
+
+ comment: https://github.com/VITA-MLLM/VITA +
+
+
+
+
+ + ♻ ☆ Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian + Neural Networks + + +
+ The computational complexity of Bayesian learning is impeding its adoption in
+practical, large-scale tasks. Despite demonstrations of significant merits such
+as improved robustness and resilience to unseen or out-of-distribution inputs
+over their non-Bayesian counterparts, their practical use has faded to near
+insignificance. In this study, we introduce an innovative framework to mitigate
+the computational burden of Bayesian neural networks (BNNs). Our approach
+follows the principle of Bayesian techniques based on deep ensembles, but
+significantly reduces their cost via multiple low-rank perturbations of
+parameters arising from a pre-trained neural network. Both vanilla ensembles
+and more sophisticated schemes, such as Bayesian learning with Stein
+Variational Gradient Descent (SVGD), previously deemed impractical for large
+models, can be seamlessly implemented within the proposed framework, called
+Bayesian Low-Rank LeArning (Bella). In a nutshell, i) Bella achieves a dramatic
+reduction in the number of trainable parameters required to approximate a
+Bayesian posterior; and ii) it not only maintains, but in some instances
+surpasses, the performance of conventional Bayesian learning methods and
+non-Bayesian baselines. Our results on large-scale tasks such as ImageNet,
+CAMELYON17, DomainNet, and VQA with CLIP and LLaVA demonstrate the
+effectiveness and versatility of Bella in building highly scalable and
+practical Bayesian deep models for real-world applications.
+
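+ A minimal, illustrative sketch of the low-rank-perturbation idea described
+above (not the Bella implementation): each ensemble member adds a trainable
+rank-r update U @ V to a frozen pre-trained layer, so only a small number of
+parameters is learned per member. Names and sizes are assumptions.
+
+    import torch
+    import torch.nn as nn
+
+    class LowRankPerturbedLinear(nn.Module):
+        def __init__(self, base: nn.Linear, rank: int = 4):
+            super().__init__()
+            self.base = base
+            for p in self.base.parameters():
+                p.requires_grad_(False)              # the pre-trained weights stay frozen
+            out_f, in_f = base.weight.shape
+            self.U = nn.Parameter(torch.randn(out_f, rank) * 0.01)
+            self.V = nn.Parameter(torch.randn(rank, in_f) * 0.01)
+
+        def forward(self, x):
+            return self.base(x) + x @ (self.U @ self.V).T
+
+    shared_base = nn.Linear(128, 10)                 # stands in for a pre-trained layer
+    members = [LowRankPerturbedLinear(shared_base) for _ in range(5)]
+    x = torch.randn(2, 128)
+    posterior_mean = torch.stack([m(x).softmax(-1) for m in members]).mean(0)
+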
+
+ comment: This paper is accepted in AAAI'2025 +
+
+
+
+
+ + ♻ ☆ Latent Space Characterization of Autoencoder Variants + + +
+ Understanding the latent spaces learned by deep learning models is crucial in +exploring how they represent and generate complex data. Autoencoders (AEs) have +played a key role in the area of representation learning, with numerous +regularization techniques and training principles developed not only to enhance +their ability to learn compact and robust representations, but also to reveal +how different architectures influence the structure and smoothness of the +lower-dimensional non-linear manifold. We strive to characterize the structure +of the latent spaces learned by different autoencoders including convolutional +autoencoders (CAEs), denoising autoencoders (DAEs), and variational +autoencoders (VAEs) and how they change with the perturbations in the input. By +characterizing the matrix manifolds corresponding to the latent spaces, we +provide an explanation for the well-known observation that the latent spaces of +CAE and DAE form non-smooth manifolds, while that of VAE forms a smooth +manifold. We also map the points of the matrix manifold to a Hilbert space +using distance preserving transforms and provide an alternate view in terms of +the subspaces generated in the Hilbert space as a function of the distortion in +the input. The results show that the latent manifolds of CAE and DAE are +stratified with each stratum being a smooth product manifold, while the +manifold of VAE is a smooth product manifold of two symmetric positive definite +matrices and a symmetric positive semi-definite matrix. + +
+
+ comment: 9 pages, 6 figures, and 1 table +
+
+
+
+
+ + ♻ ☆ STROOBnet Optimization via GPU-Accelerated Proximal Recurrence + Strategies + + +
+ Spatiotemporal networks' observational capabilities are crucial for accurate +data gathering and informed decisions across multiple sectors. This study +focuses on the Spatiotemporal Ranged Observer-Observable Bipartite Network +(STROOBnet), linking observational nodes (e.g., surveillance cameras) to events +within defined geographical regions, enabling efficient monitoring. Using data +from Real-Time Crime Camera (RTCC) systems and Calls for Service (CFS) in New +Orleans, where RTCC combats rising crime amidst reduced police presence, we +address the network's initial observational imbalances. Aiming for uniform +observational efficacy, we propose the Proximal Recurrence approach. It +outperformed traditional clustering methods like k-means and DBSCAN by offering +holistic event frequency and spatial consideration, enhancing observational +coverage. + +
+
+ comment: 10 pages, 17 figures, 2023 IEEE International Conference on Big Data + (BigData) +
+
+
+
+
+ + ♻ ☆ Enhancing Few-Shot Image Classification through Learnable Multi-Scale + Embedding and Attention Mechanisms + + +
+ In the context of few-shot classification, the goal is to train a classifier
+using a limited number of samples while maintaining satisfactory performance.
+However, traditional metric-based methods exhibit certain limitations in
+achieving this objective. These methods typically rely on a single distance
+value between the query feature and support feature, thereby overlooking the
+contribution of shallow features. To overcome this challenge, we propose a
+novel approach in this paper. Our approach involves utilizing a multi-output
+embedding network that maps samples into distinct feature spaces. The proposed
+method extracts feature vectors at different stages, enabling the model to
+capture both global and abstract features. By utilizing these diverse feature
+spaces, our model enhances its performance. Moreover, employing a
+self-attention mechanism improves the refinement of features at each stage,
+leading to even more robust representations and improved overall performance.
+Furthermore, assigning learnable weights to each stage further improves
+performance. We conducted comprehensive evaluations on the MiniImageNet and
+FC100 datasets, specifically in the 5-way 1-shot and 5-way 5-shot scenarios.
+Additionally, we performed cross-domain tasks across eight benchmark datasets,
+achieving high accuracy in the testing domains. These evaluations demonstrate
+the efficacy of our proposed method in comparison to state-of-the-art
+approaches. https://github.com/FatemehAskari/MSENet
+
+
+
+
+
+ + ♻ ☆ A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems + using Disparity Maps + + +
+ Face recognition technologies are increasingly used in various applications,
+yet they are vulnerable to face spoofing attacks. These spoofing attacks often
+involve unique 3D structures, such as printed papers or mobile device screens.
+Although stereo-depth cameras can detect such attacks effectively, their high
+cost limits their widespread adoption. Conversely, two-sensor systems without
+extrinsic calibration offer a cost-effective alternative but are unable to
+calculate depth using stereo techniques. In this work, we propose a method to
+overcome this challenge by leveraging facial attributes to derive disparity
+information and estimate relative depth for anti-spoofing purposes, using
+non-calibrated systems. We introduce a multi-modal anti-spoofing model, coined
+Disparity Model, that incorporates created disparity maps as a third modality
+alongside the two original sensor modalities. We demonstrate the effectiveness
+of the Disparity Model in countering various spoof attacks using a
+comprehensive dataset collected from the Intel RealSense ID Solution F455. Our
+method outperformed existing methods in the literature, achieving an Equal
+Error Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False
+Positive Rate (FPR) of 1%. These errors are lower than those of the best
+comparison method by 2.45% and 7.94%, respectively. Additionally, we introduce
+a model ensemble that addresses 3D spoof attacks as well, achieving an EER of
+2.04% and an FNR of 3.83% at an FPR of 1%. Overall, our work provides a
+state-of-the-art solution for the challenging task of anti-spoofing in
+non-calibrated systems that lack depth information.
+
+
+
+
+
+ + ♻ ☆ Evaluating alignment between humans and neural network representations + in image-based learning tasks + + +
+ Humans represent scenes and objects in rich feature spaces, carrying
+information that allows us to generalise about category memberships and
+abstract functions with few examples. What determines whether a neural network
+model generalises like a human? We tested how well the representations of 86
+pretrained neural network models mapped to human learning trajectories across
+two tasks where humans had to learn continuous relationships and categories of
+natural images. In these tasks, both human participants and neural networks
+successfully identified the relevant stimulus features within a few trials,
+demonstrating effective generalisation. We found that while training dataset
+size was a core determinant of alignment with human choices, contrastive
+training with multi-modal data (text and imagery) was a common feature of
+currently publicly available models that predicted human generalisation.
+Intrinsic dimensionality of representations had different effects on alignment
+for different model types. Lastly, we tested three sets of human-aligned
+representations and found no consistent improvements in predictive accuracy
+compared to the baselines. In conclusion, pretrained neural networks can serve
+to extract representations for cognitive models, as they appear to capture some
+fundamental aspects of cognition that are transferable across tasks. Both our
+paradigms and modelling approach offer a novel way to quantify alignment
+between neural networks and humans and extend cognitive science into more
+naturalistic domains.
+
+
+
+
+
+ + ♻ ☆ Instruction-Guided Fusion of Multi-Layer Visual Features in Large + Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) have achieved significant success in +multimodal tasks by combining pre-trained vision encoders and large language +models. However, current LVLMs mainly rely on features from the final layers of +the vision encoder, neglecting complementary information in shallower layers. +While recent methods have explored multi-layer features, they are often +task-agnostic. We investigate the contributions of visual features from +different encoder layers across 18 benchmarks and 6 task categories. Our +results show that multi-layer features provide complementary strengths with +varying task dependencies, and uniform fusion performs suboptimally. Based on +these findings, we propose an instruction-guided vision aggregator that +dynamically integrates multi-layer features based on textual instructions, +without increasing the number of visual tokens. Extensive evaluations show +superior performance, and analysis reveals the dominance of mid-to-high-level +features in semantic tasks and the critical role of low-level features in +fine-grained perception. This work provides valuable insights into the adaptive +use of hierarchical visual features in LVLMs, advancing more flexible +multimodal systems. + +
+
+
+
+
+ + ♻ ☆ Diffusion Models in Vision: A Survey + + +
+ Denoising diffusion models represent a recent emerging topic in computer
+vision, demonstrating remarkable results in the area of generative modeling. A
+diffusion model is a deep generative model that is based on two stages, a
+forward diffusion stage and a reverse diffusion stage. In the forward diffusion
+stage, the input data is gradually perturbed over several steps by adding
+Gaussian noise. In the reverse stage, a model is tasked with recovering the
+original input data by learning to gradually reverse the diffusion process,
+step by step. Diffusion models are widely appreciated for the quality and
+diversity of the generated samples, despite their known computational burdens,
+i.e., slow sampling due to the large number of steps involved. In this survey,
+we provide a comprehensive review of articles on denoising diffusion models
+applied in vision, comprising both theoretical and practical contributions in
+the field. First, we identify and present three generic diffusion modeling
+frameworks, which are based on denoising diffusion probabilistic models, noise
+conditioned score networks, and stochastic differential equations. We further
+discuss the relations between diffusion models and other deep generative
+models, including variational auto-encoders, generative adversarial networks,
+energy-based models, autoregressive models and normalizing flows. Then, we
+introduce a multi-perspective categorization of diffusion models applied in
+computer vision. Finally, we illustrate the current limitations of diffusion
+models and envision some interesting directions for future research.
+
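+ As a small illustration of the forward (noising) stage described above, here
+is the closed-form DDPM perturbation x_t = sqrt(alpha_bar_t) * x_0 +
+sqrt(1 - alpha_bar_t) * eps; the linear beta schedule and shapes are common
+defaults chosen for the sketch, not values prescribed by the survey.
+
+    import numpy as np
+
+    T = 1000
+    betas = np.linspace(1e-4, 0.02, T)          # noise schedule
+    alpha_bar = np.cumprod(1.0 - betas)         # cumulative signal-retention factor
+
+    def forward_diffuse(x0, t, rng):
+        """Sample x_t given clean data x0 at timestep t."""
+        eps = rng.standard_normal(x0.shape)
+        xt = np.sqrt(alpha_bar[t]) * x0 + np.sqrt(1.0 - alpha_bar[t]) * eps
+        return xt, eps                          # a reverse model learns to predict eps
+
+    rng = np.random.default_rng(0)
+    xt, eps = forward_diffuse(rng.standard_normal((3, 32, 32)), t=500, rng=rng)
+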
+
+ comment: Accepted in IEEE Transactions on Pattern Analysis and Machine + Intelligence. 25 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ DriveLM: Driving with Graph Visual Question Answering ECCV 2024 + + +
+ We study how vision-language models (VLMs) trained on web-scale data can be +integrated into end-to-end driving systems to boost generalization and enable +interactivity with human users. While recent approaches adapt VLMs to driving +via single-round visual question answering (VQA), human drivers reason about +decisions in multiple steps. Starting from the localization of key objects, +humans estimate object interactions before taking actions. The key insight is +that with our proposed task, Graph VQA, where we model graph-structured +reasoning through perception, prediction and planning question-answer pairs, we +obtain a suitable proxy task to mimic the human reasoning process. We +instantiate datasets (DriveLM-Data) built upon nuScenes and CARLA, and propose +a VLM-based baseline approach (DriveLM-Agent) for jointly performing Graph VQA +and end-to-end driving. The experiments demonstrate that Graph VQA provides a +simple, principled framework for reasoning about a driving scene, and +DriveLM-Data provides a challenging benchmark for this task. Our DriveLM-Agent +baseline performs end-to-end autonomous driving competitively in comparison to +state-of-the-art driving-specific architectures. Notably, its benefits are +pronounced when it is evaluated zero-shot on unseen objects or sensor +configurations. We hope this work can be the starting point to shed new light +on how to apply VLMs for autonomous driving. To facilitate future research, all +code, data, and models are available to the public. + +
+
+ comment: Accepted to ECCV 2024 as Oral paper +
+
+
+
+
+ + ♻ ☆ Towards an End-to-End (E2E) Adversarial Learning and Application in the + Physical World + + +
+ The traditional learning process of patch-based adversarial attacks,
+conducted in the digital domain and then applied in the physical domain (e.g.,
+via printed stickers), may suffer from reduced performance due to adversarial
+patches' limited transferability from the digital domain to the physical
+domain. Given that previous studies have considered using projectors to apply
+adversarial attacks, we raise the following question: can adversarial learning
+(i.e., patch generation) be performed entirely in the physical domain with a
+projector? In this work, we propose the Physical-domain Adversarial Patch
+Learning Augmentation (PAPLA) framework, a novel end-to-end (E2E) framework
+that converts adversarial learning from the digital domain to the physical
+domain using a projector. We evaluate PAPLA across multiple scenarios,
+including controlled laboratory settings and realistic outdoor environments,
+demonstrating its ability to ensure attack success compared to conventional
+digital learning-physical application (DL-PA) methods. We also analyze the
+impact of environmental factors, such as projection surface color, projector
+strength, ambient light, distance, and angle of the target object relative to
+the camera, on the effectiveness of projected patches. We further demonstrate
+the feasibility of the attack against a parked car and a stop sign in a
+real-world outdoor environment. Our results show that under specific
+conditions, E2E adversarial learning in the physical domain eliminates the
+transferability issue and ensures evasion by object detectors. Finally, we
+provide insights into the challenges and opportunities of applying adversarial
+learning in the physical domain and explain where such an approach is more
+effective than using a sticker.
+
+
+
+
+
+ + ♻ ☆ TextureCrop: Enhancing Synthetic Image Detection through Texture-based + Cropping + + +
+ Generative AI technologies produce increasingly realistic imagery, which,
+despite its potential for creative applications, can also be misused to produce
+misleading and harmful content. This renders Synthetic Image Detection (SID)
+methods essential for identifying AI-generated content online. State-of-the-art
+SID methods typically resize or center-crop input images due to architectural
+or computational constraints, which hampers the detection of artifacts that
+appear in high-resolution images. To address this limitation, we propose
+TextureCrop, an image pre-processing component that can be plugged into any
+pre-trained SID model to improve its performance. By focusing on high-frequency
+image parts where generative artifacts are prevalent, TextureCrop enhances SID
+performance with manageable memory requirements. Experimental results
+demonstrate a consistent improvement in AUC across various detectors by 6.1%
+compared to center cropping and by 15% compared to resizing, across
+high-resolution images from the Forensynths, Synthbuster and TWIGMA datasets.
+Code is available at https://github.com/mever-team/texture-crop.
+
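+ The following is an illustrative sketch of texture-based crop selection in
+the spirit of the idea above (not the TextureCrop implementation): fixed-size
+crops are scored by a simple high-frequency proxy and only the top-scoring ones
+are passed to the detector. Crop size, stride, and the scoring proxy are
+assumptions.
+
+    import numpy as np
+
+    def top_texture_crops(gray, crop=224, stride=112, k=4):
+        """gray: 2-D float image; returns the k crops with the most high-frequency energy."""
+        # Second differences as a cheap stand-in for a high-pass filter.
+        lap = (np.abs(np.diff(gray, n=2, axis=0))[:, :-2]
+               + np.abs(np.diff(gray, n=2, axis=1))[:-2, :])
+        scored = []
+        for y in range(0, gray.shape[0] - crop + 1, stride):
+            for x in range(0, gray.shape[1] - crop + 1, stride):
+                scored.append((lap[y:y + crop - 2, x:x + crop - 2].var(), (y, x)))
+        scored.sort(reverse=True)
+        return [gray[y:y + crop, x:x + crop] for _, (y, x) in scored[:k]]
+
+    crops = top_texture_crops(np.random.rand(1024, 1024))
+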
+
+ comment: 10 pages, 7 images +
+
+
+
+
+ + ♻ ☆ IOR: Inversed Objects Replay for Incremental Object Detection + + +
+ Existing Incremental Object Detection (IOD) methods partially alleviate +catastrophic forgetting when incrementally detecting new objects in real-world +scenarios. However, many of these methods rely on the assumption that unlabeled +old-class objects may co-occur with labeled new-class objects in the +incremental data. When unlabeled old-class objects are absent, the performance +of existing methods tends to degrade. The absence can be mitigated by +generating old-class samples, but it incurs high costs. This paper argues that +previous generation-based IOD suffers from redundancy, both in the use of +generative models, which require additional training and storage, and in the +overproduction of generated samples, many of which do not contribute +significantly to performance improvements. To eliminate the redundancy, we +propose Inversed Objects Replay (IOR). Specifically, we generate old-class +samples by inversing the original detectors, thus eliminating the necessity of +training and storing additional generative models. We propose augmented replay +to reuse the objects in generated samples, reducing redundant generations. +Moreover, we propose high-value knowledge distillation focusing on the +positions of old-class objects overwhelmed by the background, which transfers +the knowledge to the incremental detector. Extensive experiments conducted on +MS COCO 2017 demonstrate that our method can efficiently improve detection +performance in IOD scenarios with the absence of old-class objects. + +
+
+
+
+
+ + ♻ ☆ Skinned Motion Retargeting with Dense Geometric Interaction Perception NeurIPS 2024 + + +
+ Capturing and maintaining geometric interactions among different body parts
+is crucial for successful motion retargeting in skinned characters. Existing
+approaches often overlook body geometries or add a geometry correction stage
+after skeletal motion retargeting. This results in conflicts between skeleton
+interaction and geometry correction, leading to issues such as jitter,
+interpenetration, and contact mismatches. To address these challenges, we
+introduce a new retargeting framework, MeshRet, which directly models the dense
+geometric interactions in motion retargeting. Initially, we establish dense
+mesh correspondences between characters using semantically consistent sensors
+(SCS), effective across diverse mesh topologies. Subsequently, we develop a
+novel spatio-temporal representation called the dense mesh interaction (DMI)
+field. This field, a collection of interacting SCS feature vectors, skillfully
+captures both contact and non-contact interactions between body geometries. By
+aligning the DMI field during retargeting, MeshRet not only preserves motion
+semantics but also prevents self-interpenetration and ensures contact
+preservation. Extensive experiments on the public Mixamo dataset and our
+newly-collected ScanRet dataset demonstrate that MeshRet achieves
+state-of-the-art performance. Code available at
+https://github.com/abcyzj/MeshRet.
+
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ♻ ☆ reBEN: Refined BigEarthNet Dataset for Remote Sensing Image Analysis + + +
+ This paper presents refined BigEarthNet (reBEN) that is a large-scale, +multi-modal remote sensing dataset constructed to support deep learning (DL) +studies for remote sensing image analysis. The reBEN dataset consists of +549,488 pairs of Sentinel-1 and Sentinel-2 image patches. To construct reBEN, +we initially consider the Sentinel-1 and Sentinel-2 tiles used to construct the +BigEarthNet dataset and then divide them into patches of size 1200 m x 1200 m. +We apply atmospheric correction to the Sentinel-2 patches using the latest +version of the sen2cor tool, resulting in higher-quality patches compared to +those present in BigEarthNet. Each patch is then associated with a pixel-level +reference map and scene-level multi-labels. This makes reBEN suitable for +pixel- and scene-based learning tasks. The labels are derived from the most +recent CORINE Land Cover (CLC) map of 2018 by utilizing the 19-class +nomenclature as in BigEarthNet. The use of the most recent CLC map results in +overcoming the label noise present in BigEarthNet. Furthermore, we introduce a +new geographical-based split assignment algorithm that significantly reduces +the spatial correlation among the train, validation, and test sets with respect +to those present in BigEarthNet. This increases the reliability of the +evaluation of DL models. To minimize the DL model training time, we introduce +software tools that convert the reBEN dataset into a DL-optimized data format. +In our experiments, we show the potential of reBEN for multi-modal multi-label +image classification problems by considering several state-of-the-art DL +models. The pre-trained model weights, associated code, and complete dataset +are available at https://bigearth.net. + +
+
+
+
+
+ + ♻ ☆ DehazeGS: Seeing Through Fog with 3D Gaussian Splatting + + +
+ Current novel view synthesis tasks primarily rely on high-quality and clear
+images. However, in foggy scenes, scattering and attenuation can significantly
+degrade the reconstruction and rendering quality. Although NeRF-based dehazing
+reconstruction algorithms have been developed, their use of deep fully
+connected neural networks and per-ray sampling strategies leads to high
+computational costs. Moreover, NeRF's implicit representation struggles to
+recover fine details from hazy scenes. In contrast, recent advancements in 3D
+Gaussian Splatting achieve high-quality 3D scene reconstruction by explicitly
+modeling point clouds into 3D Gaussians. In this paper, we propose leveraging
+the explicit Gaussian representation to explain the foggy image formation
+process through a physically accurate forward rendering process. We introduce
+DehazeGS, a method capable of decomposing and rendering a fog-free background
+from participating media using only multi-view foggy images as input. We model
+the transmission within each Gaussian distribution to simulate the formation of
+fog. During this process, we jointly learn the atmospheric light and scattering
+coefficient while optimizing the Gaussian representation of the hazy scene. In
+the inference stage, we eliminate the effects of scattering and attenuation on
+the Gaussians and directly project them onto a 2D plane to obtain a clear view.
+Experiments on both synthetic and real-world foggy datasets demonstrate that
+DehazeGS achieves state-of-the-art performance in terms of both rendering
+quality and computational efficiency. Visualizations are available at
+https://dehazegs.github.io/.
+
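+ For reference, a minimal sketch of the standard atmospheric scattering model
+that this kind of fog formation builds on: I = J * t + A * (1 - t) with
+transmission t = exp(-beta * depth). This is only the generic image-level
+model; the per-Gaussian formulation in the paper is not reproduced here, and
+the airlight and scattering values below are arbitrary.
+
+    import numpy as np
+
+    def add_fog(clear_rgb, depth, airlight=0.8, beta=0.05):
+        t = np.exp(-beta * depth)[..., None]          # transmission from depth
+        return clear_rgb * t + airlight * (1.0 - t)
+
+    def remove_fog(foggy_rgb, depth, airlight=0.8, beta=0.05, t_min=0.05):
+        t = np.clip(np.exp(-beta * depth)[..., None], t_min, 1.0)
+        return (foggy_rgb - airlight * (1.0 - t)) / t  # invert the formation model
+
+    clear = np.random.rand(64, 64, 3)
+    depth = np.random.rand(64, 64) * 30.0
+    restored = remove_fog(add_fog(clear, depth), depth)
+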
+
+ comment: 9 pages,4 figures +
+
+
+
+
+ + ♻ ☆ StructSR: Refuse Spurious Details in Real-World Image Super-Resolution + + +
+ Diffusion-based models have shown great promise in real-world image
+super-resolution (Real-ISR), but often generate content with structural errors
+and spurious texture details due to the empirical priors and illusions of these
+models. To address this issue, we introduce StructSR, a simple, effective, and
+plug-and-play method that enhances structural fidelity and suppresses spurious
+details for diffusion-based Real-ISR. StructSR operates without the need for
+additional fine-tuning, external model priors, or high-level semantic
+knowledge. At its core is the Structure-Aware Screening (SAS) mechanism, which
+identifies the image with the highest structural similarity to the
+low-resolution (LR) input in the early inference stage, allowing us to leverage
+it as historical structure knowledge to suppress the generation of spurious
+details. By intervening in the diffusion inference process, StructSR seamlessly
+integrates with existing diffusion-based Real-ISR models. Our experimental
+results demonstrate that StructSR significantly improves the fidelity of
+structure and texture, improving the PSNR and SSIM metrics by an average of
+5.27% and 9.36% on a synthetic dataset (DIV2K-Val) and 4.13% and 8.64% on two
+real-world datasets (RealSR and DRealSR) when integrated with four
+state-of-the-art diffusion-based Real-ISR methods.
+
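+ A minimal, illustrative sketch of structure-aware screening as described
+above: among several candidate reconstructions, keep the one whose downscaled
+version is most structurally similar to the LR input. It assumes scikit-image
+is available and is not the paper's implementation.
+
+    import numpy as np
+    from skimage.metrics import structural_similarity as ssim
+    from skimage.transform import resize
+
+    def screen_candidates(candidates, lr_image):
+        """candidates: list of HxWxC arrays in [0, 1]; lr_image: hxwxC array in [0, 1]."""
+        scores = [ssim(resize(c, lr_image.shape, anti_aliasing=True), lr_image,
+                       channel_axis=-1, data_range=1.0) for c in candidates]
+        return candidates[int(np.argmax(scores))], scores
+
+    best, scores = screen_candidates(
+        [np.random.rand(128, 128, 3) for _ in range(3)], np.random.rand(32, 32, 3))
+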
+
+
+
+
+ + ♻ ☆ Direct Unlearning Optimization for Robust and Safe Text-to-Image Models NeurIPS 2024 + + +
+ Recent advancements in text-to-image (T2I) models have unlocked a wide range +of applications but also present significant risks, particularly in their +potential to generate unsafe content. To mitigate this issue, researchers have +developed unlearning techniques to remove the model's ability to generate +potentially harmful content. However, these methods are easily bypassed by +adversarial attacks, making them unreliable for ensuring the safety of +generated images. In this paper, we propose Direct Unlearning Optimization +(DUO), a novel framework for removing Not Safe For Work (NSFW) content from T2I +models while preserving their performance on unrelated topics. DUO employs a +preference optimization approach using curated paired image data, ensuring that +the model learns to remove unsafe visual concepts while retaining unrelated +features. Furthermore, we introduce an output-preserving regularization term to +maintain the model's generative capabilities on safe content. Extensive +experiments demonstrate that DUO can robustly defend against various +state-of-the-art red teaming methods without significant performance +degradation on unrelated topics, as measured by FID and CLIP scores. Our work +contributes to the development of safer and more reliable T2I models, paving +the way for their responsible deployment in both closed-source and open-source +scenarios. + +
+
+ comment: This paper has been accepted for NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Geometric Distortion Guided Transformer for Omnidirectional Image + Super-Resolution + + +
+ As virtual and augmented reality applications gain popularity, +omnidirectional image (ODI) super-resolution has become increasingly important. +Unlike 2D plain images that are formed on a plane, ODIs are projected onto +spherical surfaces. Applying established image super-resolution methods to +ODIs, therefore, requires performing equirectangular projection (ERP) to map +the ODIs onto a plane. ODI super-resolution needs to take into account +geometric distortion resulting from ERP. However, without considering such +geometric distortion of ERP images, previous deep-learning-based methods only +utilize a limited range of pixels and may easily miss self-similar textures for +reconstruction. In this paper, we introduce a novel Geometric Distortion Guided +Transformer for Omnidirectional image Super-Resolution (GDGT-OSR). +Specifically, a distortion modulated rectangle-window self-attention mechanism, +integrated with deformable self-attention, is proposed to better perceive the +distortion and thus involve more self-similar textures. Distortion modulation +is achieved through a newly devised distortion guidance generator that produces +guidance by exploiting the variability of distortion across latitudes. +Furthermore, we propose a dynamic feature aggregation scheme to adaptively fuse +the features from different self-attention modules. We present extensive +experimental results on public datasets and show that the new GDGT-OSR +outperforms methods in existing literature. + +
+
+ comment: 13 pages, 12 figures, journal +
+
+
+
+
+ + ♻ ☆ iFADIT: Invertible Face Anonymization via Disentangled Identity + Transform + + +
+ Face anonymization aims to conceal the visual identity of a face to safeguard +the individual's privacy. Traditional methods like blurring and pixelation can +largely remove identifying features, but these techniques significantly degrade +image quality and are vulnerable to deep reconstruction attacks. Generative +models have emerged as a promising solution for anonymizing faces while +preserving a natural appearance. However, many still face limitations in visual +quality and often overlook the potential to recover the original face from the +anonymized version, which can be valuable in specific contexts such as image +forensics. This paper proposes a novel framework named iFADIT, an acronym for +Invertible Face Anonymization via Disentangled Identity Transform. The +framework features a disentanglement architecture coupled with a secure +flow-based model: the former decouples identity information from +non-identifying attributes, while the latter transforms the decoupled identity +into an anonymized version in an invertible manner controlled by a secret key. +The anonymized face can then be reconstructed based on a pre-trained StyleGAN +that ensures high image quality and realistic facial details. Recovery of the +original face (aka de-anonymization) is possible upon the availability of the +matching secret, by inverting the anonymization process based on the same set +of model parameters. Furthermore, a dedicated secret-key mechanism along with a +dual-phase training strategy is devised to ensure the desired properties of +face anonymization. Qualitative and quantitative experiments demonstrate the +superiority of the proposed approach in anonymity, reversibility, security, +diversity, and interpretability over competing methods. + +
+
+
+
+
+ + ♻ ☆ Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using + Real-Time Warped Noise + + +
+ Generative modeling aims to transform random noise into structured outputs. +In this work, we enhance video diffusion models by allowing motion control via +structured latent noise sampling. This is achieved by just a change in data: we +pre-process training videos to yield structured noise. Consequently, our method +is agnostic to diffusion model design, requiring no changes to model +architectures or training pipelines. Specifically, we propose a novel noise +warping algorithm, fast enough to run in real time, that replaces random +temporal Gaussianity with correlated warped noise derived from optical flow +fields, while preserving the spatial Gaussianity. The efficiency of our +algorithm enables us to fine-tune modern video diffusion base models using +warped noise with minimal overhead, and provide a one-stop solution for a wide +range of user-friendly motion control: local object motion control, global +camera movement control, and motion transfer. The harmonization between +temporal coherence and spatial Gaussianity in our warped noise leads to +effective motion control while maintaining per-frame pixel quality. Extensive +experiments and user studies demonstrate the advantages of our method, making +it a robust and scalable approach for controlling motion in video diffusion +models. Video results are available on our webpage: +https://vgenai-netflix-eyeline-research.github.io/Go-with-the-Flow. Source code +and model checkpoints are available on GitHub: +https://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow. + +
+
+
+
+
+ + ♻ ☆ Point-PRC: A Prompt Learning Based Regulation Framework for + Generalizable Point Cloud Analysis NeurIPS 2024 + + +
+ This paper investigates the 3D domain generalization (3DDG) ability of large
+3D models based on prevalent prompt learning. Recent works demonstrate that the
+performance of 3D point cloud recognition can be boosted remarkably by
+parameter-efficient prompt tuning. However, we observe that the improvement on
+downstream tasks comes at the expense of a severe drop in 3D domain
+generalization. To resolve this challenge, we present a comprehensive
+regulation framework that allows the learnable prompts to actively interact
+with the well-learned general knowledge in large 3D models to maintain good
+generalization. Specifically, the proposed framework imposes multiple explicit
+constraints on the prompt learning trajectory by maximizing the mutual
+agreement between task-specific predictions and task-agnostic knowledge. We
+design the regulation framework as a plug-and-play module to embed into
+existing representative large 3D models. Surprisingly, our method not only
+realizes consistently increasing generalization ability but also enhances
+task-specific 3D recognition performance across various 3DDG benchmarks by a
+clear margin. Considering the lack of study and evaluation on 3DDG, we also
+create three new benchmarks, namely base-to-new, cross-dataset and few-shot
+generalization benchmarks, to enrich the field and inspire future research.
+Code and benchmarks are available at https://github.com/auniquesun/Point-PRC.
+
+
+ comment: 5 figures, 14 tables; accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ CMRxRecon2024: A Multi-Modality, Multi-View K-Space Dataset Boosting + Universal Machine Learning for Accelerated Cardiac MRI + + +
+ Cardiac magnetic resonance imaging (MRI) has emerged as the clinical
+gold-standard technique for diagnosing cardiac diseases, thanks to its ability
+to provide diverse information with multiple modalities and anatomical views.
+Accelerated cardiac MRI is highly expected to achieve time-efficient and
+patient-friendly imaging, and advanced image reconstruction approaches are
+therefore required to recover high-quality, clinically interpretable images
+from undersampled measurements. However, the lack of publicly available cardiac
+MRI k-space datasets, in terms of both quantity and diversity, has severely
+hindered substantial technological progress, particularly for data-driven
+artificial intelligence. Here, we provide a standardized, diverse, and
+high-quality CMRxRecon2024 dataset to facilitate the technical development,
+fair evaluation, and clinical transfer of cardiac MRI reconstruction
+approaches, towards promoting the universal frameworks that enable fast and
+robust reconstructions across different cardiac MRI protocols in clinical
+practice. To the best of our knowledge, the CMRxRecon2024 dataset is the
+largest and most protocol-diverse publicly available cardiac k-space dataset.
+It is acquired from 330 healthy volunteers, covering commonly used modalities,
+anatomical views, and acquisition trajectories in clinical cardiac MRI
+workflows. In addition, an open platform with tutorials, benchmarks, and data
+processing tools is provided to facilitate data usage, advanced method
+development, and fair performance evaluation.
+
+
+ comment: 23 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ VLG-CBM: Training Concept Bottleneck Models with Vision-Language + Guidance NeurIPS 2024 + + +
+ Concept Bottleneck Models (CBMs) provide interpretable prediction by +introducing an intermediate Concept Bottleneck Layer (CBL), which encodes +human-understandable concepts to explain models' decision. Recent works +proposed to utilize Large Language Models and pre-trained Vision-Language +Models to automate the training of CBMs, making it more scalable and automated. +However, existing approaches still fall short in two aspects: First, the +concepts predicted by CBL often mismatch the input image, raising doubts about +the faithfulness of interpretation. Second, it has been shown that concept +values encode unintended information: even a set of random concepts could +achieve comparable test accuracy to state-of-the-art CBMs. To address these +critical limitations, in this work, we propose a novel framework called +Vision-Language-Guided Concept Bottleneck Model (VLG-CBM) to enable faithful +interpretability with the benefits of boosted performance. Our method leverages +off-the-shelf open-domain grounded object detectors to provide visually +grounded concept annotation, which largely enhances the faithfulness of concept +prediction while further improving the model performance. In addition, we +propose a new metric called Number of Effective Concepts (NEC) to control the +information leakage and provide better interpretability. Extensive evaluations +across five standard benchmarks show that our method, VLG-CBM, outperforms +existing methods by at least 4.27% and up to 51.09% on Accuracy at NEC=5 +(denoted as ANEC-5), and by at least 0.45% and up to 29.78% on average accuracy +(denoted as ANEC-avg), while preserving both faithfulness and interpretability +of the learned concepts as demonstrated in extensive experiments. + +
+
+ comment: Appeared at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Synthesizing Forestry Images Conditioned on Plant Phenotype Using a + Generative Adversarial Network + + +
+ Plant phenology and phenotype prediction using remote sensing data are
+increasingly gaining attention within the plant science community as a
+promising approach to enhance agricultural productivity. This work focuses on
+generating synthetic forestry images that satisfy certain phenotypic
+attributes, viz. canopy greenness. We harness a Generative Adversarial Network
+(GAN) to synthesize biologically plausible and phenotypically stable forestry
+images conditioned on the greenness of vegetation (a continuous attribute) over
+a specific region of interest, describing a particular vegetation type in a
+mixed forest. The training data is based on the automated digital camera
+imagery provided by the National Ecological Observatory Network (NEON) and
+processed by the PhenoCam Network. Our method helps render the appearance of
+forest sites specific to a greenness value. The synthetic images are
+subsequently utilized to predict another phenotypic attribute, viz., redness of
+plants. The quality of the synthetic images is assessed using the Structural
+SIMilarity (SSIM) index and Fréchet Inception Distance (FID). Further, the
+greenness and redness indices of the synthetic images are compared against
+those of the original images using Root Mean Squared Percentage Error (RMSPE)
+to evaluate their accuracy and integrity. The generalizability and scalability
+of our proposed GAN model are established by effectively transforming it to
+generate synthetic images for other forest sites and vegetation types. From a
+broader perspective, this approach could be leveraged to visualize forestry
+based on different phenotypic attributes in the context of various
+environmental parameters.
+
+
+ comment: Accepted to Pattern Recognition journal +
+
+
+
+
+ + ♻ ☆ BRIGHT-VO: Brightness-Guided Hybrid Transformer for Visual Odometry with + Multi-modality Refinement Module + + +
+ Visual odometry (VO) plays a crucial role in autonomous driving, robotic +navigation, and other related tasks by estimating the position and orientation +of a camera based on visual input. Significant progress has been made in +data-driven VO methods, particularly those leveraging deep learning techniques +to extract image features and estimate camera poses. However, these methods +often struggle in low-light conditions because of the reduced visibility of +features and the increased difficulty of matching keypoints. To address this +limitation, we introduce BrightVO, a novel VO model based on Transformer +architecture, which not only performs front-end visual feature extraction, but +also incorporates a multi-modality refinement module in the back-end that +integrates Inertial Measurement Unit (IMU) data. Using pose graph optimization, +this module iteratively refines pose estimates to reduce errors and improve +both accuracy and robustness. Furthermore, we create a synthetic low-light +dataset, KiC4R, which includes a variety of lighting conditions to facilitate +the training and evaluation of VO frameworks in challenging environments. +Experimental results demonstrate that BrightVO achieves state-of-the-art +performance on both the KiC4R dataset and the KITTI benchmarks. Specifically, +it provides an average improvement of 20% in pose estimation accuracy in normal +outdoor environments and 259% in low-light conditions, outperforming existing +methods. For widespread use and further development, the research work is fully +open-source at https://github.com/Anastasiawd/BrightVO. + +
+
+ comment: We have identified significant issues in the methodology and data + analysis that impact the validity of our conclusions +
+
+
+
+
+ + ♻ ☆ A General Framework for Inference-time Scaling and Steering of Diffusion + Models + + +
+ Diffusion models produce impressive results in modalities ranging from images +and video to protein design and text. However, generating samples with +user-specified properties remains a challenge. Recent research proposes +fine-tuning models to maximize rewards that capture desired properties, but +these methods require expensive training and are prone to mode collapse. In +this work, we propose Feynman Kac (FK) steering, an inference-time framework +for steering diffusion models with reward functions. FK steering works by +sampling a system of multiple interacting diffusion processes, called +particles, and resampling particles at intermediate steps based on scores +computed using functions called potentials. Potentials are defined using +rewards for intermediate states and are selected such that a high value +indicates that the particle will yield a high-reward sample. We explore various +choices of potentials, intermediate rewards, and samplers. We evaluate FK +steering on text-to-image and text diffusion models. For steering text-to-image +models with a human preference reward, we find that FK steering a 0.8B +parameter model outperforms a 2.6B parameter fine-tuned model on prompt +fidelity, with faster sampling and no training. For steering text diffusion +models with rewards for text quality and specific text attributes, we find that +FK steering generates lower perplexity, more linguistically acceptable outputs +and enables gradient-free control of attributes like toxicity. Our results +demonstrate that inference-time scaling and steering of diffusion models, even +with off-the-shelf rewards, can provide significant sample quality gains and +controllability benefits. Code is available at +https://github.com/zacharyhorvitz/Fk-Diffusion-Steering . + +
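+ The resampling step described above admits a compact illustration. The sketch
+below is a hedged, generic version of resampling interacting particles in
+proportion to exponentiated potentials computed from an intermediate reward;
+reward_fn and the temperature lambda_ are assumptions, not the released code:
+
+# Minimal sketch of one Feynman-Kac-style resampling step over diffusion
+# "particles". Generic illustration only; reward_fn and lambda_ are assumed.
+import numpy as np
+
+def resample_particles(particles, reward_fn, lambda_=1.0, rng=None):
+    """Duplicate promising particles and drop unpromising ones."""
+    if rng is None:
+        rng = np.random.default_rng()
+    rewards = np.array([reward_fn(x) for x in particles])
+    logw = lambda_ * rewards
+    weights = np.exp(logw - logw.max())          # numerically stable softmax
+    weights /= weights.sum()
+    idx = rng.choice(len(particles), size=len(particles), p=weights)
+    return particles[idx]
+
+# Toy usage: 16 particles of a 4-dim latent, reward = negative squared norm.
+rng = np.random.default_rng(0)
+particles = rng.normal(size=(16, 4))
+particles = resample_particles(particles, lambda x: -np.sum(x ** 2), 2.0, rng)
+print(particles.shape)
+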
+
+
+
+
+ + ♻ ☆ DiffMesh: A Motion-aware Diffusion Framework for Human Mesh Recovery + from Videos + + +
+ Human mesh recovery (HMR) provides rich human body information for various
+real-world applications. While image-based HMR methods have achieved impressive
+results, they often struggle to recover humans in dynamic scenarios, leading to
+temporal inconsistencies and non-smooth 3D motion predictions due to the
+absence of human motion. In contrast, video-based approaches leverage temporal
+information to mitigate this issue. In this paper, we present DiffMesh, an
+innovative motion-aware Diffusion-like framework for video-based HMR. DiffMesh
+establishes a bridge between diffusion models and human motion, efficiently
+generating accurate and smooth output mesh sequences by incorporating human
+motion within the forward process and reverse process in the diffusion model.
+Extensive experiments are conducted on the widely used datasets (Human3.6M and
+3DPW), which demonstrate the effectiveness and efficiency of our DiffMesh.
+Visual comparisons in real-world scenarios further highlight DiffMesh's
+suitability for practical applications.
+
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ Towards Balanced Continual Multi-Modal Learning in Human Pose Estimation + + +
+ 3D human pose estimation (3D HPE) has emerged as a prominent research topic, +particularly in the realm of RGB-based methods. However, RGB images are +susceptible to limitations such as sensitivity to lighting conditions and +potential user discomfort. Consequently, multi-modal sensing, which leverages +non-intrusive sensors, is gaining increasing attention. Nevertheless, +multi-modal 3D HPE still faces challenges, including modality imbalance and the +imperative for continual learning. In this work, we introduce a novel balanced +continual multi-modal learning method for 3D HPE, which harnesses the power of +RGB, LiDAR, mmWave, and WiFi. Specifically, we propose a Shapley value-based +contribution algorithm to quantify the contribution of each modality and +identify modality imbalance. To address this imbalance, we employ a re-learning +strategy. Furthermore, recognizing that raw data is prone to noise +contamination, we develop a novel denoising continual learning approach. This +approach incorporates a noise identification and separation module to mitigate +the adverse effects of noise and collaborates with the balanced learning +strategy to enhance optimization. Additionally, an adaptive EWC mechanism is +employed to alleviate catastrophic forgetting. We conduct extensive experiments +on the widely-adopted multi-modal dataset, MM-Fi, which demonstrate the +superiority of our approach in boosting 3D pose estimation and mitigating +catastrophic forgetting in complex scenarios. We will release our codes. + +
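+ The Shapley value-based contribution measure mentioned above can be
+illustrated with an exact computation over the four modalities; the subset
+utility v(S) below (e.g. validation accuracy when only the modalities in S are
+used) is a made-up placeholder, not the paper's procedure:
+
+# Illustrative exact Shapley values over a small modality set.
+# v(S) stands in for a per-subset utility such as validation accuracy.
+from itertools import combinations
+from math import factorial
+
+MODALITIES = ("RGB", "LiDAR", "mmWave", "WiFi")
+
+def shapley(v, players=MODALITIES):
+    n = len(players)
+    phi = {}
+    for i in players:
+        others = [p for p in players if p != i]
+        total = 0.0
+        for r in range(n):
+            for S in combinations(others, r):
+                w = factorial(len(S)) * factorial(n - len(S) - 1) / factorial(n)
+                total += w * (v(frozenset(S) | {i}) - v(frozenset(S)))
+        phi[i] = total
+    return phi
+
+# Toy additive utility (so the Shapley values equal the coefficients).
+def v(S):
+    return (0.30 * ("RGB" in S) + 0.20 * ("LiDAR" in S)
+            + 0.10 * ("mmWave" in S) + 0.05 * ("WiFi" in S))
+
+print(shapley(v))
+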
+
+
+
+
+ + ♻ ☆ Collaboration in Immersive Environments: Challenges and Solutions + + +
+ Virtual Reality (VR) and Augmented Reality (AR) tools have been applied in
+all engineering fields in order to avoid the use of physical prototypes, to
+train in high-risk situations, and to interpret real or simulated results. In
+order to complete a shared task or assign tasks to the agents in such immersive
+environments, collaboration or Shared Cooperative Activities are a necessity.
+Collaboration in immersive environments is an emerging field of research that
+aims to study and enhance the ways in which people interact and work together
+in Virtual and Augmented Reality settings. It is a complex process that
+involves different factors such as communication, coordination, and social
+presence. This paper provides an overview of the current state of research on
+collaboration in immersive environments. It discusses the different types of
+immersive environments, including VR and AR, and the different forms of
+collaboration that can occur in these environments. The paper also highlights
+the challenges and limitations of collaboration in immersive environments, such
+as the lack of physical cues, cost and usability concerns, and the need for
+further research in this area. Overall, collaboration in immersive environments
+is a promising field with a wide range of potential applications, from
+education to industry, and it can benefit both individuals and groups by
+enhancing their ability to work together effectively.
+
+
+ comment: Added new references in Networking section +
+
+
+
+
+ + ♻ ☆ Rethinking Pre-Trained Feature Extractor Selection in Multiple Instance + Learning for Whole Slide Image Classification + + +
+ Multiple instance learning (MIL) has become a preferred method for gigapixel +whole slide image (WSI) classification without requiring patch-level +annotations. Current MIL research primarily relies on embedding-based +approaches, which extract patch features using a pre-trained feature extractor +and aggregate them for slide-level prediction. Despite the critical role of +feature extraction, there is limited guidance on selecting optimal feature +extractors to maximize WSI performance. This study addresses this gap by +systematically evaluating MIL feature extractors across three dimensions: +pre-training dataset, backbone model, and pre-training method. Extensive +experiments were conducted on two public WSI datasets (TCGA-NSCLC and +Camelyon16) using four state-of-the-art (SOTA) MIL models. Our findings reveal +that: 1) selecting a robust self-supervised learning (SSL) method has a greater +impact on performance than relying solely on an in-domain pre-training dataset; +2) prioritizing Transformer-based backbones with deeper architectures over +CNN-based models; and 3) using larger, more diverse pre-training datasets +significantly enhances classification outcomes. We hope that these insights can +provide practical guidance for optimizing WSI classification and explain the +reasons behind the performance advantages of the current SOTA pathology +foundation models. Furthermore, this work may inform the development of more +effective pathology foundation models. Our code is publicly available at +https://github.com/bryanwong17/MIL-Feature-Extractor-Selection + +
+
+ comment: Accepted to IEEE International Symposium on Biomedical Imaging (ISBI) + 2025 +
+
+
+
+
+ + ♻ ☆ PhysMamba: State Space Duality Model for Remote Physiological + Measurement + + +
+ Remote Photoplethysmography (rPPG) enables non-contact physiological signal +extraction from facial videos, offering applications in psychological state +analysis, medical assistance, and anti-face spoofing. However, challenges such +as motion artifacts, lighting variations, and noise limit its real-world +applicability. To address these issues, we propose PhysMamba, a novel +dual-pathway time-frequency interaction model based on Synergistic State Space +Duality (SSSD), which for the first time integrates state space models with +attention mechanisms in a dual-branch framework. Combined with a Multi-Scale +Query (MQ) mechanism, PhysMamba achieves efficient information exchange and +enhanced feature representation, ensuring robustness under noisy and dynamic +conditions. Experiments on PURE, UBFC-rPPG, and MMPD datasets demonstrate that +PhysMamba outperforms state-of-the-art methods, offering superior accuracy and +generalization. This work lays a strong foundation for practical applications +in non-contact health monitoring, including real-time remote patient care. + +
+
+
+
+
+ + ♻ ☆ Enhanced Masked Image Modeling to Avoid Model Collapse on Multi-modal + MRI Datasets + + +
+ Multi-modal magnetic resonance imaging (MRI) provides information about
+lesions from different views for computer-aided diagnosis. Deep learning
+algorithms are suitable for identifying specific anatomical structures,
+segmenting lesions, and classifying diseases. Manual labels are limited due to
+the high expense, which hinders further improvement of accuracy.
+Self-supervised learning, particularly masked image modeling (MIM), has shown
+promise in utilizing unlabeled data. However, we observe model collapse when
+applying MIM to multi-modal MRI datasets, and downstream tasks see no
+performance improvement when built on the collapsed model. To solve model
+collapse, we analyze and address it in two types: complete collapse and
+dimensional collapse. We find that complete collapse occurs because the
+collapsed loss value in multi-modal MRI datasets falls below the normally
+converged loss value. Based on this, the hybrid mask pattern (HMP) masking
+strategy is introduced to elevate the collapsed loss above the normally
+converged loss value and avoid complete collapse. Additionally, we reveal that
+dimensional collapse stems from insufficient feature uniformity in MIM. We
+mitigate dimensional collapse by introducing the pyramid Barlow twins (PBT)
+module as an explicit regularization method. Overall, we construct the enhanced
+MIM (E-MIM) with the HMP and PBT modules to avoid model collapse on multi-modal
+MRI. Experiments are conducted on three multi-modal MRI datasets to validate
+the effectiveness of our approach in preventing both types of model collapse.
+By preventing model collapse, the training of the model becomes more stable,
+resulting in a decent improvement in performance for segmentation and
+classification tasks. The code is available at
+https://github.com/LinxuanHan/E-MIM.
+
+
+ comment: This work has been submitted to the IEEE for possible publication.
+  Copyright may be transferred without notice, after which this version may no
+  longer be accessible
+
+
+
+
+
+ + ♻ ☆ CryoBench: Diverse and challenging datasets for the heterogeneity + problem in cryo-EM NeurIPS 2024 + + +
+ Cryo-electron microscopy (cryo-EM) is a powerful technique for determining +high-resolution 3D biomolecular structures from imaging data. Its unique +ability to capture structural variability has spurred the development of +heterogeneous reconstruction algorithms that can infer distributions of 3D +structures from noisy, unlabeled imaging data. Despite the growing number of +advanced methods, progress in the field is hindered by the lack of standardized +benchmarks with ground truth information and reliable validation metrics. Here, +we introduce CryoBench, a suite of datasets, metrics, and benchmarks for +heterogeneous reconstruction in cryo-EM. CryoBench includes five datasets +representing different sources of heterogeneity and degrees of difficulty. +These include conformational heterogeneity generated from designed motions of +antibody complexes or sampled from a molecular dynamics simulation, as well as +compositional heterogeneity from mixtures of ribosome assembly states or 100 +common complexes present in cells. We then analyze state-of-the-art +heterogeneous reconstruction tools, including neural and non-neural methods, +assess their sensitivity to noise, and propose new metrics for quantitative +evaluation. We hope that CryoBench will be a foundational resource for +accelerating algorithmic development and evaluation in the cryo-EM and machine +learning communities. Project page: https://cryobench.cs.princeton.edu. + +
+
+ comment: Accepted by NeurIPS 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Swin transformers are robust to distribution and concept drift in + endoscopy-based longitudinal rectal cancer assessment + + +
+ Endoscopic images are used at various stages of rectal cancer treatment,
+starting from cancer screening and diagnosis, during treatment to assess
+response and toxicity from treatments such as colitis, and at follow-up to
+detect new tumors or local regrowth (LR). However, subjective assessment is
+highly variable and can underestimate the degree of response in some patients,
+subjecting them to unnecessary surgery, or overestimate response, which places
+patients at risk of disease spread. Advances in deep learning have shown the
+ability to produce consistent and objective response assessment for endoscopic
+images. However, methods for detecting cancers, regrowth, and monitoring
+response during the entire course of patient treatment and follow-up are
+lacking. This is because automated diagnosis and rectal cancer response
+assessment require methods that are robust to inherent imaging illumination
+variations and confounding conditions (blood, scope, blurring) present in
+endoscopy images, as well as changes to the normal lumen and tumor during
+treatment. Hence, a hierarchical shifted window (Swin) transformer was trained
+to distinguish rectal cancer from normal lumen using endoscopy images. Swin, as
+well as two convolutional (ResNet-50, WideResNet-50) and vision transformer
+(ViT) models, were trained and evaluated on follow-up longitudinal images to
+detect LR on a private dataset as well as on out-of-distribution (OOD) public
+colonoscopy datasets to detect pre/non-cancerous polyps. Color shifts were
+applied using optimal transport to simulate distribution shifts. Swin and
+ResNet models were similarly accurate on the in-distribution dataset. Swin was
+more accurate than the other methods (follow-up: 0.84, OOD: 0.83) even when
+subject to color shifts (follow-up: 0.83, OOD: 0.87), indicating its capability
+to provide robust performance for longitudinal cancer assessment.
+
+
+ comment: The work has been accepted for publication in 2024 SPIE Medical + Imaging conference proceedings +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 27 + +
+
+
+ + ☆ Applying General Turn-taking Models to Conversational Human-Robot + Interaction + + +
+ Turn-taking is a fundamental aspect of conversation, but current Human-Robot +Interaction (HRI) systems often rely on simplistic, silence-based models, +leading to unnatural pauses and interruptions. This paper investigates, for the +first time, the application of general turn-taking models, specifically TurnGPT +and Voice Activity Projection (VAP), to improve conversational dynamics in HRI. +These models are trained on human-human dialogue data using self-supervised +learning objectives, without requiring domain-specific fine-tuning. We propose +methods for using these models in tandem to predict when a robot should begin +preparing responses, take turns, and handle potential interruptions. We +evaluated the proposed system in a within-subject study against a traditional +baseline system, using the Furhat robot with 39 adults in a conversational +setting, in combination with a large language model for autonomous response +generation. The results show that participants significantly prefer the +proposed system, and it significantly reduces response delays and +interruptions. + +
+
+ comment: Accepted at HRI 2025 (the IEEE/ACM International Conference on + Human-Robot Interaction) +
+
+
+
+
+ + ☆ A Reinforcement Learning Approach to Quiet and Safe UAM Traffic + Management + + +
+ Urban air mobility (UAM) is a transformative system that operates various
+small aerial vehicles in urban environments to reshape urban transportation.
+However, integrating UAM into existing urban environments presents a variety of
+complex challenges. Recent analyses of UAM's operational constraints highlight
+aircraft noise and system safety as key hurdles to UAM system implementation.
+Future UAM air traffic management schemes must ensure that the system is both
+quiet and safe. We propose a multi-agent reinforcement learning approach to
+manage UAM traffic, aiming at both vertical separation assurance and noise
+mitigation. Through extensive training, the reinforcement learning agent learns
+to balance the two primary objectives by employing altitude adjustments in a
+multi-layer UAM network. The results reveal the tradeoffs among noise impact,
+traffic congestion, and separation. Overall, our findings demonstrate the
+potential of reinforcement learning in mitigating UAM's noise impact while
+maintaining safe separation using altitude adjustments.
+
+
+ comment: Paper presented at SciTech 2025 +
+
+
+
+
+ + ☆ When Uncertainty Leads to Unsafety: Empirical Insights into the Role of + Uncertainty in Unmanned Aerial Vehicle Safety + + +
+ Despite the recent developments in obstacle avoidance and other safety +features, autonomous Unmanned Aerial Vehicles (UAVs) continue to face safety +challenges. No previous work investigated the relationship between the +behavioral uncertainty of a UAV and the unsafety of its flight. By quantifying +uncertainty, it is possible to develop a predictor for unsafety, which acts as +a flight supervisor. We conducted a large-scale empirical investigation of +safety violations using PX4-Autopilot, an open-source UAV software platform. +Our dataset of over 5,000 simulated flights, created to challenge obstacle +avoidance, allowed us to explore the relation between uncertain UAV decisions +and safety violations: up to 89% of unsafe UAV states exhibit significant +decision uncertainty, and up to 74% of uncertain decisions lead to unsafe +states. Based on these findings, we implemented Superialist (Supervising +Autonomous Aerial Vehicles), a runtime uncertainty detector based on +autoencoders, the state-of-the-art technology for anomaly detection. +Superialist achieved high performance in detecting uncertain behaviors with up +to 96% precision and 93% recall. Despite the observed performance degradation +when using the same approach for predicting unsafety (up to 74% precision and +87% recall), Superialist enabled early prediction of unsafe states up to 50 +seconds in advance. + +
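+ The autoencoder-based detection idea described above can be sketched
+compactly. The snippet below is a hedged illustration in the same spirit (not
+Superialist itself): a small PyTorch autoencoder is fit on nominal flight-state
+windows and flags uncertainty when reconstruction error exceeds a percentile
+threshold. Feature dimension, architecture, and the threshold are assumptions:
+
+# Hedged sketch of an autoencoder-based runtime uncertainty detector.
+# Dimensions, architecture, and the 99th-percentile threshold are assumptions.
+import torch
+import torch.nn as nn
+
+class TinyAE(nn.Module):
+    def __init__(self, dim=32, latent=8):
+        super().__init__()
+        self.enc = nn.Sequential(nn.Linear(dim, 16), nn.ReLU(), nn.Linear(16, latent))
+        self.dec = nn.Sequential(nn.Linear(latent, 16), nn.ReLU(), nn.Linear(16, dim))
+
+    def forward(self, x):
+        return self.dec(self.enc(x))
+
+def fit(model, nominal, epochs=50, lr=1e-3):
+    opt = torch.optim.Adam(model.parameters(), lr=lr)
+    for _ in range(epochs):
+        opt.zero_grad()
+        loss = nn.functional.mse_loss(model(nominal), nominal)
+        loss.backward()
+        opt.step()
+    with torch.no_grad():  # nominal reconstruction errors define the threshold
+        err = ((model(nominal) - nominal) ** 2).mean(dim=1)
+    return torch.quantile(err, 0.99)
+
+def is_uncertain(model, window, threshold):
+    with torch.no_grad():
+        return ((model(window) - window) ** 2).mean(dim=1) > threshold
+
+# Toy usage with random data standing in for logged flight-state windows.
+nominal = torch.randn(512, 32)
+model = TinyAE()
+thr = fit(model, nominal)
+print(is_uncertain(model, torch.randn(4, 32), thr))
+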
+
+ comment: 36 pages +
+
+
+
+
+ + ☆ SLC$^2$-SLAM: Semantic-guided Loop Closure with Shared Latent Code for + NeRF SLAM + + +
+ Targeting the notorious cumulative drift errors in NeRF SLAM, we propose a
+Semantic-guided Loop Closure with Shared Latent Code, dubbed SLC$^2$-SLAM. In
+particular, we argue that latent codes stored in many NeRF SLAM systems are not
+fully exploited, as they are only used for better reconstruction. In this
+paper, we propose a simple yet effective way to detect potential loops using
+the same latent codes as local features. To further improve the loop detection
+performance, we use the semantic information, which is also decoded from the
+same latent codes, to guide the aggregation of local features. Finally, with
+the potential loops detected, we close them with a graph optimization followed
+by bundle adjustment to refine both the estimated poses and the reconstructed
+scene. To evaluate the performance of our SLC$^2$-SLAM, we conduct extensive
+experiments on the Replica and ScanNet datasets. Our proposed semantic-guided
+loop closure significantly outperforms the pre-trained NetVLAD and ORB combined
+with Bag-of-Words, which are used in all other NeRF SLAM systems with loop
+closure. As a result, our SLC$^2$-SLAM also demonstrates better tracking and
+reconstruction performance, especially in larger scenes with more loops, like
+ScanNet.
+
+
+ comment: 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Task Allocation in Mobile Robot Fleets: A review + + +
+ Mobile robot fleets are currently used in different scenarios such as medical
+environments or logistics. The management of these systems poses different
+challenges that range from the control of the movement of each robot to the
+allocation of tasks to be performed. The Task Allocation (TA) problem is a key
+topic for the proper management of mobile robot fleets, ensuring the
+minimization of energy consumption and of the number of robots required.
+Solutions to this problem are essential to reach the economic and environmental
+sustainability of robot fleets, mainly in industrial applications such as
+warehouse logistics. The minimization of energy consumption frames the TA
+problem as an optimization issue, which has been treated in recent studies.
+This work focuses on the analysis of current trends in solving TA for mobile
+robot fleets. The main TA optimization algorithms are presented, including
+novel methods based on Artificial Intelligence (AI). Additionally, this work
+showcases the most important results extracted from simulations, including the
+frameworks utilized for the development of the simulations. Finally, some
+conclusions are drawn from the analysis to highlight gaps that must be
+addressed in the future.
+
+
+
+
+
+ + GS-LIVO: Real-Time LiDAR, Inertial, and Visual Multi-sensor Fused + Odometry with Gaussian Mapping + + +
+ In recent years, 3D Gaussian splatting (3D-GS) has emerged as a novel scene +representation approach. However, existing vision-only 3D-GS methods often rely +on hand-crafted heuristics for point-cloud densification and face challenges in +handling occlusions and high GPU memory and computation consumption. +LiDAR-Inertial-Visual (LIV) sensor configuration has demonstrated superior +performance in localization and dense mapping by leveraging complementary +sensing characteristics: rich texture information from cameras, precise +geometric measurements from LiDAR, and high-frequency motion data from IMU. +Inspired by this, we propose a novel real-time Gaussian-based simultaneous +localization and mapping (SLAM) system. Our map system comprises a global +Gaussian map and a sliding window of Gaussians, along with an IESKF-based +odometry. The global Gaussian map consists of hash-indexed voxels organized in +a recursive octree, effectively covering sparse spatial volumes while adapting +to different levels of detail and scales. The Gaussian map is initialized +through multi-sensor fusion and optimized with photometric gradients. Our +system incrementally maintains a sliding window of Gaussians, significantly +reducing GPU computation and memory consumption by only optimizing the map +within the sliding window. Moreover, we implement a tightly coupled +multi-sensor fusion odometry with an iterative error state Kalman filter +(IESKF), leveraging real-time updating and rendering of the Gaussian map. Our +system represents the first real-time Gaussian-based SLAM framework deployable +on resource-constrained embedded systems, demonstrated on the NVIDIA Jetson +Orin NX platform. The framework achieves real-time performance while +maintaining robust multi-sensor fusion capabilities. All implementation +algorithms, hardware designs, and CAD models will be publicly available. + +
+
+
+
+
+ + ☆ Application of Deep Reinforcement Learning to UAV Swarming for Ground + Surveillance + + +
+ This paper summarizes in depth the state of the art of aerial swarms, +covering both classical and new reinforcement-learning-based approaches for +their management. Then, it proposes a hybrid AI system, integrating deep +reinforcement learning in a multi-agent centralized swarm architecture. The +proposed system is tailored to perform surveillance of a specific area, +searching and tracking ground targets, for security and law enforcement +applications. The swarm is governed by a central swarm controller responsible +for distributing different search and tracking tasks among the cooperating +UAVs. Each UAV agent is then controlled by a collection of cooperative +sub-agents, whose behaviors have been trained using different deep +reinforcement learning models, tailored for the different task types proposed +by the swarm controller. More specifically, proximal policy optimization (PPO) +algorithms were used to train the agents' behavior. In addition, several +metrics to assess the performance of the swarm in this application were +defined. The results obtained through simulation show that our system searches +the operation area effectively, acquires the targets in a reasonable time, and +is capable of tracking them continuously and consistently. + +
+
+
+
+
+ + ☆ Self-Organizing Edge Computing Distribution Framework for Visual SLAM + + +
+ Localization within a known environment is a crucial capability for mobile +robots. Simultaneous Localization and Mapping (SLAM) is a prominent solution to +this problem. SLAM is a framework that consists of a diverse set of +computational tasks ranging from real-time tracking to computation-intensive +map optimization. This combination can present a challenge for resource-limited +mobile robots. Previously, edge-assisted SLAM methods have demonstrated +promising real-time execution capabilities by offloading heavy computations +while performing real-time tracking onboard. However, the common approach of +utilizing a client-server architecture for offloading is sensitive to server +and network failures. In this article, we propose a novel edge-assisted SLAM +framework capable of self-organizing fully distributed SLAM execution across a +network of devices or functioning on a single device without connectivity. The +architecture consists of three layers and is designed to be device-agnostic, +resilient to network failures, and minimally invasive to the core SLAM system. +We have implemented and demonstrated the framework for monocular ORB SLAM3 and +evaluated it in both fully distributed and standalone SLAM configurations +against the ORB SLAM3. The experiment results demonstrate that the proposed +design matches the accuracy and resource utilization of the monolithic approach +while enabling collaborative execution. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Image-to-Force Estimation for Soft Tissue Interaction in + Robotic-Assisted Surgery Using Structured Light + + +
+ For Minimally Invasive Surgical (MIS) robots, accurate haptic interaction +force feedback is essential for ensuring the safety of interacting with soft +tissue. However, most existing MIS robotic systems cannot facilitate direct +measurement of the interaction force with hardware sensors due to space +limitations. This letter introduces an effective vision-based scheme that +utilizes a One-Shot structured light projection with a designed pattern on soft +tissue coupled with haptic information processing through a trained +image-to-force neural network. The images captured from the endoscopic stereo +camera are analyzed to reconstruct high-resolution 3D point clouds for soft +tissue deformation. Based on this, a modified PointNet-based force estimation +method is proposed, which excels in representing the complex mechanical +properties of soft tissue. Numerical force interaction experiments are +conducted on three silicon materials with different stiffness. The results +validate the effectiveness of the proposed scheme. + +
+
+
+
+
+ + ☆ GOTLoc: General Outdoor Text-based Localization Using Scene Graph + Retrieval with OpenStreetMap + + +
+ We propose GOTLoc, a robust localization method capable of operating even in +outdoor environments where GPS signals are unavailable. The method achieves +this robust localization by leveraging comparisons between scene graphs +generated from text descriptions and maps. Existing text-based localization +studies typically represent maps as point clouds and identify the most similar +scenes by comparing embeddings of text and point cloud data. However, point +cloud maps have limited scalability as it is impractical to pre-generate maps +for all outdoor spaces. Furthermore, their large data size makes it challenging +to store and utilize them directly on actual robots. To address these issues, +GOTLoc leverages compact data structures, such as scene graphs, to store +spatial information, enabling individual robots to carry and utilize large +amounts of map data. Additionally, by utilizing publicly available map data, +such as OpenStreetMap, which provides global information on outdoor spaces, we +eliminate the need for additional effort to create custom map data. For +performance evaluation, we utilized the KITTI360Pose dataset in conjunction +with corresponding OpenStreetMap data to compare the proposed method with +existing approaches. Our results demonstrate that the proposed method achieves +accuracy comparable to algorithms relying on point cloud maps. Moreover, in +city-scale tests, GOTLoc required significantly less storage compared to point +cloud-based methods and completed overall processing within a few seconds, +validating its applicability to real-world robotics. Our code is available at +https://github.com/donghwijung/GOTLoc. + +
+
+
+
+
+ + ☆ LAMS: LLM-Driven Automatic Mode Switching for Assistive Teleoperation + + +
+ Teleoperating high degrees-of-freedom (DoF) robotic manipulators via low-DoF +controllers like joysticks often requires frequent switching between control +modes, where each mode maps controller movements to specific robot actions. +Manually performing this frequent switching can make teleoperation cumbersome +and inefficient. On the other hand, existing automatic mode-switching +solutions, such as heuristic-based or learning-based methods, are often +task-specific and lack generalizability. In this paper, we introduce LLM-Driven +Automatic Mode Switching (LAMS), a novel approach that leverages Large Language +Models (LLMs) to automatically switch control modes based on task context. +Unlike existing methods, LAMS requires no prior task demonstrations and +incrementally improves by integrating user-generated mode-switching examples. +We validate LAMS through an ablation study and a user study with 10 +participants on complex, long-horizon tasks, demonstrating that LAMS +effectively reduces manual mode switches, is preferred over alternative +methods, and improves performance over time. The project website with +supplementary materials is at https://lams-assistance.github.io/. + +
+
+
+
+
+ + ☆ Chance-Constrained Sampling-Based MPC for Collision Avoidance in + Uncertain Dynamic Environments + + +
+ Navigating safely in dynamic and uncertain environments is challenging due to +uncertainties in perception and motion. This letter presents C2U-MPPI, a robust +sampling-based Model Predictive Control (MPC) framework that addresses these +challenges by leveraging the Unscented Model Predictive Path Integral (U-MPPI) +control strategy with integrated probabilistic chance constraints, ensuring +more reliable and efficient navigation under uncertainty. Unlike gradient-based +MPC methods, our approach (i) avoids linearization of system dynamics and +directly applies non-convex and nonlinear chance constraints, enabling more +accurate and flexible optimization, and (ii) enhances computational efficiency +by reformulating probabilistic constraints into a deterministic form and +employing a layered dynamic obstacle representation, enabling real-time +handling of multiple obstacles. Extensive experiments in simulated and +real-world human-shared environments validate the effectiveness of our +algorithm against baseline methods, showcasing its capability to generate +feasible trajectories and control inputs that adhere to system dynamics and +constraints in dynamic settings, enabled by unscented-based sampling strategy +and risk-sensitive trajectory evaluation. A supplementary video is available +at: https://youtu.be/FptAhvJlQm8 + +
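+ One standard way to turn a probabilistic constraint into a deterministic one,
+as alluded to above, is the Gaussian tightening: P(a^T x <= b) >= 1 - eps is
+equivalent to a^T mu + Phi^{-1}(1 - eps) * sqrt(a^T Sigma a) <= b. The sketch
+below illustrates only that generic reformulation; it is not the C2U-MPPI
+implementation, and all names are assumptions:
+
+# Deterministic surrogate of a linear Gaussian chance constraint, a common
+# textbook reformulation used in chance-constrained MPC (illustrative only).
+import numpy as np
+from scipy.stats import norm
+
+def chance_constraint_satisfied(a, b, mu, Sigma, eps=0.05):
+    """Check a^T mu + z_{1-eps} * sqrt(a^T Sigma a) <= b for x ~ N(mu, Sigma)."""
+    a, mu = np.asarray(a, float), np.asarray(mu, float)
+    tightening = norm.ppf(1.0 - eps) * np.sqrt(a @ Sigma @ a)
+    return float(a @ mu + tightening) <= b
+
+# Toy usage: keep the robot's x-position (mean 1.0 m, std 0.2 m) below 2.0 m
+# with at least 95% probability.
+mu = np.array([1.0, 0.0])
+Sigma = np.diag([0.2 ** 2, 0.1 ** 2])
+print(chance_constraint_satisfied(a=[1.0, 0.0], b=2.0, mu=mu, Sigma=Sigma))
+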
+
+ comment: This paper has 8 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ A Framework for Dynamic Situational Awareness in Human Robot Teams: An + Interview Study + + +
+ In human-robot teams, human situational awareness is the operator's conscious +knowledge of the team's states, actions, plans and their environment. +Appropriate human situational awareness is critical to successful human-robot +collaboration. In human-robot teaming, it is often assumed that the best and +required level of situational awareness is knowing everything at all times. +This view is problematic, because what a human needs to know for optimal team +performance varies given the dynamic environmental conditions, task context and +roles and capabilities of team members. We explore this topic by interviewing +16 participants with active and repeated experience in diverse human-robot +teaming applications. Based on analysis of these interviews, we derive a +framework explaining the dynamic nature of required situational awareness in +human-robot teaming. In addition, we identify a range of factors affecting the +dynamic nature of required and actual levels of situational awareness (i.e., +dynamic situational awareness), types of situational awareness inefficiencies +resulting from gaps between actual and required situational awareness, and +their main consequences. We also reveal various strategies, initiated by humans +and robots, that assist in maintaining the required situational awareness. Our +findings inform the implementation of accurate estimates of dynamic situational +awareness and the design of user-adaptive human-robot interfaces. Therefore, +this work contributes to the future design of more collaborative and effective +human-robot teams. + +
+
+
+
+
+ + ☆ Unified Few-shot Crack Segmentation and its Precise 3D Automatic + Measurement in Concrete Structures + + +
+ Visual-Spatial Systems have become increasingly essential in concrete crack
+inspection. However, existing methods often lack adaptability to diverse
+scenarios, exhibit limited robustness in image-based approaches, and struggle
+with curved or complex geometries. To address these limitations, an innovative
+framework for two-dimensional (2D) crack detection, three-dimensional (3D)
+reconstruction, and 3D automatic crack measurement is proposed in this study by
+integrating computer vision technologies and multi-modal Simultaneous
+Localization and Mapping (SLAM). First, building on a base DeepLabv3+
+segmentation model and incorporating specific refinements based on the Segment
+Anything Model (SAM) foundation model, we developed a crack segmentation method
+with strong generalization across unfamiliar scenarios, enabling the generation
+of precise 2D crack masks. To enhance the accuracy and robustness of 3D
+reconstruction, Light Detection and Ranging (LiDAR) point clouds were utilized
+together with image data and segmentation masks. By leveraging both image- and
+LiDAR-SLAM, we developed a multi-frame and multi-modal fusion framework that
+produces dense, colorized point clouds, effectively capturing crack semantics
+at a 3D real-world scale. Furthermore, the geometric attributes of cracks were
+measured automatically and directly within the dense 3D point cloud space,
+surpassing the limitations of conventional 2D image-based measurements. This
+advancement makes the method suitable for structural components with curved and
+complex 3D geometries. Experimental results across various concrete structures
+highlight the significant improvements and unique advantages of the proposed
+method, demonstrating its effectiveness, accuracy, and robustness in real-world
+applications.
+
+
+
+
+
+ + ☆ Combining Movement Primitives with Contraction Theory RA-L + + +
+ This paper presents a modular framework for motion planning using movement +primitives. Central to the approach is Contraction Theory, a modular stability +tool for nonlinear dynamical systems. The approach extends prior methods by +achieving parallel and sequential combinations of both discrete and rhythmic +movements, while enabling independent modulation of each movement. This modular +framework enables a divide-and-conquer strategy to simplify the programming of +complex robot motion planning. Simulation examples illustrate the flexibility +and versatility of the framework, highlighting its potential to address diverse +challenges in robot motion planning. + +
+
+ comment: 8 pages, 4 figures, submitted to Robotics and Automation Letters + (RA-L) for review +
+
+
+
+
+ + ☆ Estimation-Aware Trajectory Optimization with Set-Valued Measurement + Uncertainties + + +
+ In this paper, we present an optimization-based framework for generating +estimation-aware trajectories in scenarios where measurement (output) +uncertainties are state-dependent and set-valued. The framework leverages the +concept of regularity for set-valued output maps. Specifically, we demonstrate +that, for output-regular maps, one can utilize a set-valued observability +measure that is concave with respect to finite-horizon state trajectories. By +maximizing this measure, optimized estimation-aware trajectories can be +designed for a broad class of systems, including those with locally linearized +dynamics. To illustrate the effectiveness of the proposed approach, we provide +a representative example in the context of trajectory planning for vision-based +estimation. We present an estimation-aware trajectory for an uncooperative +target-tracking problem that uses a machine learning (ML)-based estimation +module on an ego-satellite. + +
+
+ comment: 25 pages, 5 figures +
+
+
+
+
+ + ☆ Embodied Scene Understanding for Vision Language Models via MetaVQA + + +
+ Vision Language Models (VLMs) demonstrate significant potential as embodied +AI agents for various mobility applications. However, a standardized, +closed-loop benchmark for evaluating their spatial reasoning and sequential +decision-making capabilities is lacking. To address this, we present MetaVQA: a +comprehensive benchmark designed to assess and enhance VLMs' understanding of +spatial relationships and scene dynamics through Visual Question Answering +(VQA) and closed-loop simulations. MetaVQA leverages Set-of-Mark prompting and +top-down view ground-truth annotations from nuScenes and Waymo datasets to +automatically generate extensive question-answer pairs based on diverse +real-world traffic scenarios, ensuring object-centric and context-rich +instructions. Our experiments show that fine-tuning VLMs with the MetaVQA +dataset significantly improves their spatial reasoning and embodied scene +comprehension in safety-critical simulations, evident not only in improved VQA +accuracies but also in emerging safety-aware driving maneuvers. In addition, +the learning demonstrates strong transferability from simulation to real-world +observation. Code and data will be publicly available at +https://metadriverse.github.io/metavqa . + +
+
+ comment: for the project webpage, see https://metadriverse.github.io/metavqa +
+
+
+
+
+ + ☆ AutoLoop: Fast Visual SLAM Fine-tuning through Agentic Curriculum + Learning + + +
+ Current visual SLAM systems face significant challenges in balancing +computational efficiency with robust loop closure handling. Traditional +approaches require careful manual tuning and incur substantial computational +overhead, while learning-based methods either lack explicit loop closure +capabilities or implement them through computationally expensive methods. We +present AutoLoop, a novel approach that combines automated curriculum learning +with efficient fine-tuning for visual SLAM systems. Our method employs a DDPG +(Deep Deterministic Policy Gradient) agent to dynamically adjust loop closure +weights during training, eliminating the need for manual hyperparameter search +while significantly reducing the required training steps. The approach +pre-computes potential loop closure pairs offline and leverages them through an +agent-guided curriculum, allowing the model to adapt efficiently to new +scenarios. Experiments conducted on TartanAir for training and validated across +multiple benchmarks including KITTI, EuRoC, ICL-NUIM and TUM RGB-D demonstrate +that AutoLoop achieves comparable or superior performance while reducing +training time by an order of magnitude compared to traditional approaches. +AutoLoop provides a practical solution for rapid adaptation of visual SLAM +systems, automating the weight tuning process that traditionally requires +multiple manual iterations. Our results show that this automated curriculum +strategy not only accelerates training but also maintains or improves the +model's performance across diverse environmental conditions. + +
+
+
+
+
+ + ♻ ☆ Real-World Evaluation of two Cooperative Intersection Management + Approaches + + +
+ Cooperative maneuver planning promises to significantly improve traffic +efficiency at unsignalized intersections by leveraging connected automated +vehicles. Previous works on this topic have been mostly developed for +completely automated traffic in a simple simulated environment. In contrast, +our previously introduced planning approaches are specifically designed to +handle real-world mixed traffic. The two methods are based on multi-scenario +prediction and graph-based reinforcement learning, respectively. This is the +first study to perform evaluations in a novel mixed traffic simulation +framework as well as real-world drives with prototype connected automated +vehicles in public traffic. The simulation features the same connected +automated driving software stack as deployed on one of the automated vehicles. +Our quantitative evaluations show that cooperative maneuver planning achieves a +substantial reduction in crossing times and the number of stops. In a realistic +environment with few automated vehicles, there are noticeable efficiency gains +with only slightly increasing criticality metrics. + +
+
+ comment: M. Klimke and M. B. Mertens are both first authors with equal + contribution. 10 pages, 9 figures, 3 tables, submitted to IEEE Intelligent + Transportation Systems Magazine +
+
+
+
+
+ + ♻ ☆ Learning Low-Dimensional Strain Models of Soft Robots by Looking at the + Evolution of Their Shape with Application to Model-Based Control + + +
+ Obtaining dynamic models of continuum soft robots is central to the analysis +and control of soft robots, and researchers have devoted much attention to the +challenge of proposing both data-driven and first-principle solutions. Both +avenues have, however, shown their limitations; the former lacks structure and +performs poorly outside training data, while the latter requires significant +simplifications and extensive expert knowledge to be used in practice. This +paper introduces a streamlined method for learning low-dimensional, +physics-based models that are both accurate and easy to interpret. We start +with an algorithm that uses image data (i.e., shape evolutions) to determine +the minimal necessary segments for describing a soft robot's movement. +Following this, we apply a dynamic regression and strain sparsification +algorithm to identify relevant strains and define the model's dynamics. We +validate our approach through simulations with various planar soft +manipulators, comparing its performance against other learning strategies, +showing that our models are both computationally efficient and 25x more +accurate on out-of-training distribution inputs. Finally, we demonstrate that +thanks to the capability of the method of generating physically compatible +models, the learned models can be straightforwardly combined with model-based +control policies. + +
+
+ comment: 8 pages, appearing in Proceedings of the 2025 IEEE 8th International + Conference on Soft Robotics (RoboSoft) +
+
+
+
+
+ + ♻ ☆ Evaluation of Artificial Intelligence Methods for Lead Time Prediction + in Non-Cycled Areas of Automotive Production + + +
+ The present study examines the effectiveness of applying Artificial
+Intelligence methods in an automotive production environment to predict unknown
+lead times in a non-cycle-controlled production area. Data structures are
+analyzed to identify contextual features and then preprocessed using one-hot
+encoding. Method selection focuses on supervised machine learning techniques;
+within supervised learning, both regression and classification methods are
+evaluated. Continuous regression is not feasible due to the distribution of the
+target variable. Analysis of the classification methods shows that Ensemble
+Learning and Support Vector Machines are the most suitable. Preliminary study
+results indicate that the gradient boosting algorithms LightGBM, XGBoost, and
+CatBoost yield the best results. After further testing and extensive
+hyperparameter optimization, the final method choice is the LightGBM algorithm.
+Depending on feature availability and prediction interval granularity, relative
+prediction accuracies of up to 90% can be achieved. Further tests highlight the
+importance of periodically retraining the AI models to accurately represent
+complex production processes using the database. The research demonstrates that
+AI methods can be effectively applied to highly variable production data,
+adding business value by providing an additional metric for various control
+tasks while outperforming current non-AI-based systems.
+
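+ As a hedged sketch of the selected approach (one-hot encoded contextual
+features feeding a LightGBM classifier over discretized lead-time intervals),
+the snippet below uses made-up column names, bins, and hyperparameters; it is
+not the study's pipeline:
+
+# Illustrative LightGBM classification of lead-time intervals from one-hot
+# encoded context features. All column names and bins are assumptions.
+import numpy as np
+import pandas as pd
+from lightgbm import LGBMClassifier
+from sklearn.model_selection import train_test_split
+
+rng = np.random.default_rng(0)
+df = pd.DataFrame({
+    "station": rng.choice(["paintshop", "rework", "buffer"], 1000),
+    "variant": rng.choice(["A", "B", "C"], 1000),
+    "queue_len": rng.integers(0, 50, 1000),
+    "lead_time_h": rng.gamma(2.0, 3.0, 1000),
+})
+# Discretize the target into intervals (classification instead of regression).
+df["lead_time_bin"] = pd.cut(df["lead_time_h"], bins=[0, 4, 8, 16, np.inf],
+                             labels=False)
+
+X = pd.get_dummies(df[["station", "variant", "queue_len"]])
+y = df["lead_time_bin"]
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+
+clf = LGBMClassifier(n_estimators=200, learning_rate=0.05)
+clf.fit(X_tr, y_tr)
+print("holdout accuracy:", (clf.predict(X_te) == y_te).mean())
+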
+
+
+
+
+ + ♻ ☆ Mind the Error! Detection and Localization of Instruction Errors in + Vision-and-Language Navigation IROS'24 + + +
+ Vision-and-Language Navigation in Continuous Environments (VLN-CE) is one of +the most intuitive yet challenging embodied AI tasks. Agents are tasked to +navigate towards a target goal by executing a set of low-level actions, +following a series of natural language instructions. All VLN-CE methods in the +literature assume that language instructions are exact. However, in practice, +instructions given by humans can contain errors when describing a spatial +environment due to inaccurate memory or confusion. Current VLN-CE benchmarks do +not address this scenario, making the state-of-the-art methods in VLN-CE +fragile in the presence of erroneous instructions from human users. For the +first time, we propose a novel benchmark dataset that introduces various types +of instruction errors considering potential human causes. This benchmark +provides valuable insight into the robustness of VLN systems in continuous +environments. We observe a noticeable performance drop (up to -25%) in Success +Rate when evaluating the state-of-the-art VLN-CE methods on our benchmark. +Moreover, we formally define the task of Instruction Error Detection and +Localization, and establish an evaluation protocol on top of our benchmark +dataset. We also propose an effective method, based on a cross-modal +transformer architecture, that achieves the best performance in error detection +and localization, compared to baselines. Surprisingly, our proposed method has +revealed errors in the validation set of the two commonly used datasets for +VLN-CE, i.e., R2R-CE and RxR-CE, demonstrating the utility of our technique in +other tasks. Code and dataset available at +https://intelligolabs.github.io/R2RIE-CE + +
+
+ comment: 3 figures, 8 pages. Accepted at IROS'24 +
+
+
+
+
+ + ♻ ☆ Reward-Driven Automated Curriculum Learning for Interaction-Aware + Self-Driving at Unsignalized Intersections + + +
+ In this work, we present a reward-driven automated curriculum reinforcement
+learning approach for interaction-aware self-driving at unsignalized
+intersections, taking into account the uncertainties associated with
+surrounding vehicles (SVs). These uncertainties encompass both the driving
+intentions of SVs and the number of SVs. To deal with this problem, the
+curriculum set is specifically designed to accommodate a progressively
+increasing number of SVs. By implementing an automated curriculum selection
+mechanism, the importance weights are rationally allocated across various
+curricula, thereby facilitating improved sample efficiency and training
+outcomes. Furthermore, the reward function is meticulously designed to guide
+the agent towards effective policy exploration. Thus, the proposed framework
+could proactively address the above uncertainties at unsignalized intersections
+by employing the automated curriculum learning technique that progressively
+increases task difficulty, and this ensures safe self-driving through effective
+interaction with SVs. Comparative experiments are conducted in Highway_Env, and
+the results indicate that our approach achieves the highest task success rate,
+attains strong robustness to initialization parameters of the curriculum
+selection module, and exhibits superior adaptability to diverse situational
+configurations at unsignalized intersections. Furthermore, the effectiveness of
+the proposed method is validated using the high-fidelity CARLA simulator.
+
+
+ comment: 8 pages, 6 figures, add grant information, minor textual polishing +
+
+
+
+
+ + ♻ ☆ ModCube: Modular, Self-Assembling Cubic Underwater Robot + + +
+ This paper presents a low-cost, centralized modular underwater robot
+platform, ModCube, which can be used to study swarm coordination for a wide
+range of tasks in underwater environments. A ModCube structure consists of
+multiple ModCube robots. Each robot can move in six DoF with eight thrusters
+and can be rigidly connected to other ModCube robots with an electromagnet
+controlled by an onboard computer. In this paper, we present a novel method for
+characterizing and visualizing dynamic behavior, along with four benchmarks to
+evaluate the morphological performance of the robot. Analysis shows that our
+ModCube design is desirable for omnidirectional tasks, compared with the
+configurations widely used by commercial underwater robots. We run real robot
+experiments in two water tanks to demonstrate the robust control and
+self-assembly of the proposed system. We also open-source the design and code
+to facilitate future research.
+
+
+ comment: 8 pages, 8 figures, letter +
+
+
+
+
+ + ♻ ☆ RoboHorizon: An LLM-Assisted Multi-View World Model for Long-Horizon + Robotic Manipulation + + +
+ Efficient control in long-horizon robotic manipulation is challenging due to +complex representation and policy learning requirements. Model-based visual +reinforcement learning (RL) has shown great potential in addressing these +challenges but still faces notable limitations, particularly in handling sparse +rewards and complex visual features in long-horizon environments. To address +these limitations, we propose the Recognize-Sense-Plan-Act (RSPA) pipeline for +long-horizon tasks and further introduce RoboHorizon, an LLM-assisted +multi-view world model tailored for long-horizon robotic manipulation. In +RoboHorizon, pre-trained LLMs generate dense reward structures for multi-stage +sub-tasks based on task language instructions, enabling robots to better +recognize long-horizon tasks. Keyframe discovery is then integrated into the +multi-view masked autoencoder (MAE) architecture to enhance the robot's ability +to sense critical task sequences, strengthening its multi-stage perception of +long-horizon processes. Leveraging these dense rewards and multi-view +representations, a robotic world model is constructed to efficiently plan +long-horizon tasks, enabling the robot to reliably act through RL algorithms. +Experiments on two representative benchmarks, RLBench and FurnitureBench, show +that RoboHorizon outperforms state-of-the-art visual model-based RL methods, +achieving a 23.35% improvement in task success rates on RLBench's 4 +short-horizon tasks and a 29.23% improvement on 6 long-horizon tasks from +RLBench and 3 furniture assembly tasks from FurnitureBench. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ On the Surprising Effectiveness of Spectrum Clipping in Learning Stable + Linear Dynamics + + +
+ When learning stable linear dynamical systems from data, three important +properties are desirable: i) predictive accuracy, ii) provable stability, and +iii) computational efficiency. Unconstrained minimization of reconstruction +errors leads to high accuracy and efficiency but cannot guarantee stability. +Existing methods to remedy this focus on enforcing stability while also +ensuring accuracy, but do so only at the cost of increased computation. In this +work, we investigate if a straightforward approach can simultaneously offer all +three desiderata of learning stable linear systems. Specifically, we consider a +post-hoc approach that manipulates the spectrum of the learned system matrix +after it is learned in an unconstrained fashion. We call this approach spectrum +clipping (SC) as it involves eigen decomposition and subsequent reconstruction +of the system matrix after clipping all of its eigenvalues that are larger than +one to one (without altering the eigenvectors). Through detailed experiments +involving two different applications and publicly available benchmark datasets, +we demonstrate that this simple technique can simultaneously learn highly +accurate linear systems that are provably stable. Notably, we demonstrate that +SC can achieve similar or better performance than strong baselines while being +orders-of-magnitude faster. We also show that SC can be readily combined with +Koopman operators to learn stable nonlinear dynamics, such as those underlying +complex dexterous manipulation skills involving multi-fingered robotic hands. +Further, we find that SC can learn stable robot policies even when the training +data includes unsuccessful or truncated demonstrations. Our codes and dataset +can be found at https://github.com/GT-STAR-Lab/spec_clip. + +
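+ The clipping operation described above is simple enough to sketch directly.
+The snippet below follows the description in the abstract (eigendecomposition,
+clip eigenvalue magnitudes above one back to one, reconstruct with unchanged
+eigenvectors); it is an illustration rather than the released code:
+
+# Sketch of spectrum clipping (SC) for a learned linear system x_{t+1} = A x_t.
+import numpy as np
+
+def spectrum_clip(A):
+    w, V = np.linalg.eig(A)                       # eigenvalues / eigenvectors
+    mag = np.abs(w)
+    w_clipped = np.where(mag > 1.0, w / mag, w)   # rescale onto the unit circle
+    A_clipped = V @ np.diag(w_clipped) @ np.linalg.inv(V)
+    return A_clipped.real if np.isrealobj(A) else A_clipped
+
+# Toy usage: an unstable random 3x3 system becomes (marginally) stable.
+rng = np.random.default_rng(0)
+A = rng.normal(size=(3, 3)) * 1.2
+print(np.abs(np.linalg.eigvals(A)).max(),
+      np.abs(np.linalg.eigvals(spectrum_clip(A))).max())
+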
+
+ comment: Under review by L4DC 2025 +
+
+
+
+
+ + ♻ ☆ Experimental Study on The Effect of Multi-step Deep Reinforcement + Learning in POMDPs + + +
+ Deep Reinforcement Learning (DRL) has made tremendous advances in both +simulated and real-world robot control tasks in recent years. This is +particularly the case for tasks that can be carefully engineered with a full +state representation, and which can then be formulated as a Markov Decision +Process (MDP). However, applying DRL strategies designed for MDPs to novel +robot control tasks can be challenging, because the available observations may +be a partial representation of the state, resulting in a Partially Observable +Markov Decision Process (POMDP). This paper considers three popular DRL +algorithms, namely Proximal Policy Optimization (PPO), Twin Delayed Deep +Deterministic Policy Gradient (TD3), and Soft Actor-Critic (SAC), invented for +MDPs, and studies their performance in POMDP scenarios. While prior work has +found that SAC and TD3 typically outperform PPO across a broad range of tasks +that can be represented as MDPs, we show that this is not always the case, +using three representative POMDP environments. Empirical studies show that this +is related to multi-step bootstrapping, where multi-step immediate rewards, +instead of one-step immediate reward, are used to calculate the target value +estimation of an observation and action pair. We identify this by observing +that the inclusion of multi-step bootstrapping in TD3 (MTD3) and SAC (MSAC) +results in improved robustness in POMDP settings. + +
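+ Multi-step bootstrapping, which the study identifies as the key ingredient,
+replaces the one-step TD target with an n-step return. A minimal, generic
+sketch of that target (not tied to the paper's MTD3/MSAC code) is shown below;
+bootstrap_q is assumed to come from a target critic at step t+n:
+
+# Generic n-step bootstrapped target:
+#   G_t = sum_{k=0}^{n-1} gamma^k r_{t+k} + gamma^n (1 - done) Q_target(s_{t+n}, a_{t+n})
+def n_step_target(rewards, bootstrap_q, dones, gamma=0.99, n=3):
+    """rewards/dones cover the next n transitions; bootstrap_q is Q at step t+n."""
+    G, discount = 0.0, 1.0
+    for k in range(n):
+        G += discount * rewards[k]
+        if dones[k]:                      # episode ended before n steps
+            return G
+        discount *= gamma
+    return G + discount * bootstrap_q
+
+# Toy usage: three rewards followed by a bootstrapped value of 2.0.
+print(n_step_target(rewards=[1.0, 0.5, 0.0], bootstrap_q=2.0,
+                    dones=[False, False, False]))
+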
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 125 + +
+
+
+ + ☆ Ouroboros-Diffusion: Exploring Consistent Content Generation in + Tuning-free Long Video Diffusion + + +
+ The first-in-first-out (FIFO) video diffusion, built on a pre-trained +text-to-video model, has recently emerged as an effective approach for +tuning-free long video generation. This technique maintains a queue of video +frames with progressively increasing noise, continuously producing clean frames +at the queue's head while Gaussian noise is enqueued at the tail. However, +FIFO-Diffusion often struggles to keep long-range temporal consistency in the +generated videos due to the lack of correspondence modeling across frames. In +this paper, we propose Ouroboros-Diffusion, a novel video denoising framework +designed to enhance structural and content (subject) consistency, enabling the +generation of consistent videos of arbitrary length. Specifically, we introduce +a new latent sampling technique at the queue tail to improve structural +consistency, ensuring perceptually smooth transitions among frames. To enhance +subject consistency, we devise a Subject-Aware Cross-Frame Attention (SACFA) +mechanism, which aligns subjects across frames within short segments to achieve +better visual coherence. Furthermore, we introduce self-recurrent guidance. +This technique leverages information from all previous cleaner frames at the +front of the queue to guide the denoising of noisier frames at the end, +fostering rich and contextual global information interaction. Extensive +experiments of long video generation on the VBench benchmark demonstrate the +superiority of our Ouroboros-Diffusion, particularly in terms of subject +consistency, motion smoothness, and temporal consistency. + +
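A highly simplified sketch of the FIFO mechanism described above (clean frames dequeued at the head, Gaussian noise enqueued at the tail, one denoising step per frame per iteration); `denoise_step` stands in for a real reverse-diffusion step, and none of Ouroboros-Diffusion's consistency additions (latent sampling, SACFA, self-recurrent guidance) are modeled here.

```python
from collections import deque
import numpy as np

def fifo_generate(denoise_step, num_out, queue_len=16, shape=(4, 32, 32), seed=0):
    """
    Conceptual FIFO video-diffusion loop: the queue holds frame latents at
    progressively increasing noise levels; each iteration denoises every frame
    by one level, pops the now-clean head, and pushes pure noise at the tail.
    """
    rng = np.random.default_rng(seed)
    levels = np.linspace(0.0, 1.0, queue_len)            # head ~clean, tail ~pure noise
    queue = deque(rng.standard_normal((queue_len, *shape)))
    outputs = []
    while len(outputs) < num_out:
        for i, level in enumerate(levels):               # one denoising step per frame
            queue[i] = denoise_step(queue[i], level)
        outputs.append(queue.popleft())                  # clean frame leaves the head
        queue.append(rng.standard_normal(shape))         # fresh noise enters the tail
    return outputs

# trivial stand-in denoiser: shrink the latent proportionally to its noise level
frames = fifo_generate(lambda z, lv: z * (1.0 - 0.1 * lv), num_out=4)
print(len(frames), frames[0].shape)
```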
+
+
+
+
+ + ☆ Multimodal LLMs Can Reason about Aesthetics in Zero-Shot + + +
+ We present the first study on how Multimodal LLMs' (MLLMs) reasoning ability
+can be elicited to evaluate the aesthetics of artworks. To facilitate this
+investigation, we construct MM-StyleBench, a novel high-quality dataset for
+benchmarking artistic stylization. We then develop a principled method for
+human preference modeling and perform a systematic correlation analysis between
+MLLMs' responses and human preferences. Our experiments reveal an inherent
+hallucination issue of MLLMs in art evaluation, associated with response
+subjectivity. We propose ArtCoT, demonstrating that art-specific task
+decomposition and the use of concrete language boost MLLMs' reasoning ability
+for aesthetics. Our findings offer valuable insights into MLLMs for art and can
+benefit a wide range of downstream applications, such as style transfer and
+artistic image generation. Code is available at
+https://github.com/songrise/MLLM4Art.
+
+
+ comment: WIP, Homepage https://github.com/songrise/MLLM4Art +
+
+
+
+
+ + ☆ SimGen: A Diffusion-Based Framework for Simultaneous Surgical Image and + Segmentation Mask Generation + + +
+ Acquiring and annotating surgical data is often resource-intensive, ethically
+constrained, and requires significant expert involvement. While generative AI
+models like text-to-image can alleviate data scarcity, incorporating spatial
+annotations, such as segmentation masks, is crucial for precision-driven
+surgical applications, simulation, and education. This study introduces both a
+novel task and method, SimGen, for Simultaneous Image and Mask Generation.
+SimGen is a diffusion model based on the DDPM framework and Residual U-Net,
+designed to jointly generate high-fidelity surgical images and their
+corresponding segmentation masks. The model leverages cross-correlation priors
+to capture dependencies between continuous image and discrete mask
+distributions. Additionally, a Canonical Fibonacci Lattice (CFL) is employed to
+enhance class separability and uniformity in the RGB space of the masks. SimGen
+delivers high-fidelity images and accurate segmentation masks, outperforming
+baselines across six public datasets assessed on image and semantic inception
+distance metrics. An ablation study shows that the CFL improves mask quality
+and spatial separation. Downstream experiments suggest that the generated
+image-mask pairs remain usable when regulations limit the release of human data
+for research. This work offers a cost-effective solution for generating paired
+surgical images and complex labels, advancing surgical AI development by
+reducing the need for expensive manual annotations.
+
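One plausible way to realize a Fibonacci-lattice color assignment of the kind mentioned above is to place class points quasi-uniformly on a sphere and map them into the RGB cube; the paper's exact CFL construction may differ, so treat this as an assumption-laden sketch.

```python
import numpy as np

def fibonacci_lattice_colors(n_classes):
    """
    Spread `n_classes` label colors quasi-uniformly by placing points on a
    Fibonacci sphere lattice and mapping them into the unit RGB cube.
    """
    i = np.arange(n_classes)
    golden = np.pi * (1.0 + np.sqrt(5.0))            # golden-angle azimuth increment
    phi = np.arccos(1.0 - 2.0 * (i + 0.5) / n_classes)
    theta = golden * i
    xyz = np.stack([np.sin(phi) * np.cos(theta),
                    np.sin(phi) * np.sin(theta),
                    np.cos(phi)], axis=1)
    rgb = (xyz + 1.0) / 2.0                          # [-1, 1]^3 -> [0, 1]^3
    return (rgb * 255).astype(np.uint8)

palette = fibonacci_lattice_colors(8)                # e.g. 8 surgical mask classes
print(palette)
```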
+
+ comment: 12 pages, 17 figures, 4 tables, project page at + https://camma-public.github.io/endogen/ +
+
+
+
+
+ + ☆ Vision Foundation Models for Computed Tomography + + +
+ Foundation models (FMs) have shown transformative potential in radiology by +performing diverse, complex tasks across imaging modalities. Here, we developed +CT-FM, a large-scale 3D image-based pre-trained model designed explicitly for +various radiological tasks. CT-FM was pre-trained using 148,000 computed +tomography (CT) scans from the Imaging Data Commons through label-agnostic +contrastive learning. We evaluated CT-FM across four categories of tasks, +namely, whole-body and tumor segmentation, head CT triage, medical image +retrieval, and semantic understanding, showing superior performance against +state-of-the-art models. Beyond quantitative success, CT-FM demonstrated the +ability to cluster regions anatomically and identify similar anatomical and +structural concepts across scans. Furthermore, it remained robust across +test-retest settings and indicated reasonable salient regions attached to its +embeddings. This study demonstrates the value of large-scale medical imaging +foundation models and by open-sourcing the model weights, code, and data, aims +to support more adaptable, reliable, and interpretable AI solutions in +radiology. + +
+
+ comment: 6 figures, followed by 9 Extended Data Figures and a Supplementary + Information document +
+
+
+
+
+ + ☆ RepVideo: Rethinking Cross-Layer Representation for Video Generation + + +
+ Video generation has achieved remarkable progress with the introduction of +diffusion models, which have significantly improved the quality of generated +videos. However, recent research has primarily focused on scaling up model +training, while offering limited insights into the direct impact of +representations on the video generation process. In this paper, we initially +investigate the characteristics of features in intermediate layers, finding +substantial variations in attention maps across different layers. These +variations lead to unstable semantic representations and contribute to +cumulative differences between features, which ultimately reduce the similarity +between adjacent frames and negatively affect temporal coherence. To address +this, we propose RepVideo, an enhanced representation framework for +text-to-video diffusion models. By accumulating features from neighboring +layers to form enriched representations, this approach captures more stable +semantic information. These enhanced representations are then used as inputs to +the attention mechanism, thereby improving semantic expressiveness while +ensuring feature consistency across adjacent frames. Extensive experiments +demonstrate that our RepVideo not only significantly enhances the ability to +generate accurate spatial appearances, such as capturing complex spatial +relationships between multiple objects, but also improves temporal consistency +in video generation. + +
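The core idea of accumulating features from neighboring layers can be sketched as a simple sliding-window average over per-layer outputs (PyTorch); the real RepVideo aggregation and the way it feeds the attention mechanism are more involved than this.

```python
import torch

def accumulate_layer_features(layer_feats, window=3):
    """
    Average each layer's features with its neighbours within `window` layers
    to form an enriched, more stable representation (simplified view).
    layer_feats: list of tensors, one per layer, each of shape (B, T, C).
    """
    stacked = torch.stack(layer_feats)                  # (L, B, T, C)
    L = stacked.shape[0]
    enriched = []
    for l in range(L):
        lo, hi = max(0, l - window // 2), min(L, l + window // 2 + 1)
        enriched.append(stacked[lo:hi].mean(dim=0))     # average neighbouring layers
    return enriched

feats = [torch.randn(2, 16, 64) for _ in range(12)]     # 12 dummy layer outputs
out = accumulate_layer_features(feats)
print(len(out), out[0].shape)                           # 12, torch.Size([2, 16, 64])
```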
+
+ comment: Project page: https://vchitect.github.io/RepVid-Webpage +
+
+
+
+
+ + ☆ CityDreamer4D: Compositional Generative Model of Unbounded 4D Cities + + +
+ 3D scene generation has garnered growing attention in recent years and has +made significant progress. Generating 4D cities is more challenging than 3D +scenes due to the presence of structurally complex, visually diverse objects +like buildings and vehicles, and heightened human sensitivity to distortions in +urban environments. To tackle these issues, we propose CityDreamer4D, a +compositional generative model specifically tailored for generating unbounded +4D cities. Our main insights are 1) 4D city generation should separate dynamic +objects (e.g., vehicles) from static scenes (e.g., buildings and roads), and 2) +all objects in the 4D scene should be composed of different types of neural +fields for buildings, vehicles, and background stuff. Specifically, we propose +Traffic Scenario Generator and Unbounded Layout Generator to produce dynamic +traffic scenarios and static city layouts using a highly compact BEV +representation. Objects in 4D cities are generated by combining stuff-oriented +and instance-oriented neural fields for background stuff, buildings, and +vehicles. To suit the distinct characteristics of background stuff and +instances, the neural fields employ customized generative hash grids and +periodic positional embeddings as scene parameterizations. Furthermore, we +offer a comprehensive suite of datasets for city generation, including OSM, +GoogleEarth, and CityTopia. The OSM dataset provides a variety of real-world +city layouts, while the Google Earth and CityTopia datasets deliver +large-scale, high-quality city imagery complete with 3D instance annotations. +Leveraging its compositional design, CityDreamer4D supports a range of +downstream applications, such as instance editing, city stylization, and urban +simulation, while delivering state-of-the-art performance in generating +realistic 4D cities. + +
+
+
+
+
+ + ☆ CityLoc: 6 DoF Localization of Text Descriptions in Large-Scale Scenes + with Gaussian Representation + + +
+ Localizing text descriptions in large-scale 3D scenes is inherently an
+ambiguous task. Such ambiguity arises, for example, when describing general
+concepts, e.g. all traffic lights in a city.
+ To facilitate reasoning based on such concepts, text localization in the form
+of a distribution is required. In this paper, we generate the distribution of
+camera poses conditioned upon the textual description.
+ To facilitate such generation, we propose a diffusion-based architecture that
+conditionally diffuses the noisy 6DoF camera poses to their plausible
+locations.
+ The conditional signals are derived from the text descriptions, using
+pre-trained text encoders. The connection between text descriptions and the
+pose distribution is established through a pretrained vision-language model,
+i.e. CLIP. Furthermore, we demonstrate that the candidate poses for the
+distribution can be further refined by rendering potential poses using 3D
+Gaussian splatting, guiding incorrectly posed samples towards locations that
+better align with the textual description through visual reasoning.
+ We demonstrate the effectiveness of our method by comparing it with both
+standard retrieval methods and learning-based approaches. Our proposed method
+consistently outperforms these baselines across all five large-scale datasets.
+Our source code and dataset will be made publicly available.
+
+
+
+
+
+ + ☆ An analysis of data variation and bias in image-based dermatological + datasets for machine learning classification + + +
+ AI algorithms have become valuable in aiding professionals in healthcare. The
+increasing confidence achieved by these models is helpful in critical
+decision-making. In clinical dermatology, classification models can detect
+malignant lesions on patients' skin using only RGB images as input. However,
+most learning-based methods employ training data acquired from dermoscopic
+datasets, which are large and validated by a gold standard. Clinical models, in
+contrast, must classify images from users' smartphone cameras, which lack the
+resolution provided by dermoscopy. Clinical applications also bring new
+challenges: captures from uncontrolled environments, skin tone variations,
+viewpoint changes, noise in data and labels, and unbalanced classes. A possible
+alternative would be to use transfer learning to deal with the clinical images.
+However, because the number of clinical samples is low and the source
+distribution used in training differs from the test set, transfer learning can
+degrade the model's performance. This work aims to evaluate the gap between
+dermoscopic and clinical samples and to understand how dataset variations
+impact training. It assesses the main differences between distributions that
+disturb the model's prediction. Finally, from experiments on different
+architectures, we discuss how to combine data from divergent distributions,
+decreasing the impact on the model's final accuracy.
+
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ☆ Visual WetlandBirds Dataset: Bird Species Identification and Behavior + Recognition in Videos + + +
+ The current biodiversity loss crisis makes animal monitoring a relevant field +of study. In light of this, data collected through monitoring can provide +essential insights, and information for decision-making aimed at preserving +global biodiversity. Despite the importance of such data, there is a notable +scarcity of datasets featuring videos of birds, and none of the existing +datasets offer detailed annotations of bird behaviors in video format. In +response to this gap, our study introduces the first fine-grained video dataset +specifically designed for bird behavior detection and species classification. +This dataset addresses the need for comprehensive bird video datasets and +provides detailed data on bird actions, facilitating the development of deep +learning models to recognize these, similar to the advancements made in human +action recognition. The proposed dataset comprises 178 videos recorded in +Spanish wetlands, capturing 13 different bird species performing 7 distinct +behavior classes. In addition, we also present baseline results using state of +the art models on two tasks: bird behavior recognition and species +classification. + +
+
+
+
+
+ + ☆ Learning Joint Denoising, Demosaicing, and Compression from the Raw + Natural Image Noise Dataset + + +
+ This paper introduces the Raw Natural Image Noise Dataset (RawNIND), a +diverse collection of paired raw images designed to support the development of +denoising models that generalize across sensors, image development workflows, +and styles. Two denoising methods are proposed: one operates directly on raw +Bayer data, leveraging computational efficiency, while the other processes +linear RGB images for improved generalization to different sensors, with both +preserving flexibility for subsequent development. Both methods outperform +traditional approaches which rely on developed images. Additionally, the +integration of denoising and compression at the raw data level significantly +enhances rate-distortion performance and computational efficiency. These +findings suggest a paradigm shift toward raw data workflows for efficient and +flexible image processing. + +
+
+
+
+
+ + ☆ Empowering Agricultural Insights: RiceLeafBD - A Novel Dataset and + Optimal Model Selection for Rice Leaf Disease Diagnosis through Transfer + Learning Technique + + +
+ The population of Bangladesh, an agricultural nation surrounded by lush
+greenery, is growing daily. As a result, arable land is shrinking as it gives
+way to residential housing and industrial factories. A food crisis is becoming
+a major threat in the coming years: on the one hand, the population is
+increasing, and on the other, food crop production is decreasing due to disease
+outbreaks. Rice is one of the most significant cultivated crops since it
+provides food for more than half of the world's population. Bangladesh depends
+on rice (Oryza sativa) as a vital crop for its agriculture, but it faces a
+significant problem as a result of the ongoing decline in rice yield brought on
+by common diseases. Early disease detection is the main difficulty in rice crop
+cultivation. In this paper, we propose our own dataset, collected from fields
+in Bangladesh, and apply deep learning and transfer learning models to evaluate
+it. We describe our dataset in detail and also give directions for further
+research work to serve society using this dataset. We applied a light CNN model
+and pre-trained InceptionNet-V2, EfficientNet-V2, and MobileNet-V2 models, with
+the EfficientNet-V2 model achieving the best performance of 91.5%. The results
+obtained surpassed other models and even exceeded approaches considered part of
+the state of the art. This study demonstrates that it is possible to precisely
+and effectively identify diseases that affect rice leaves using this unbiased
+dataset. After analyzing the performance of different models, we argue that the
+proposed dataset is a significant resource for research aimed at reducing rice
+leaf disease.
+
+
+
+
+
+ + ☆ Lights, Camera, Matching: The Role of Image Illumination in Fair Face + Recognition + + +
+ Facial brightness is a key image quality factor impacting face recognition +accuracy differentials across demographic groups. In this work, we aim to +decrease the accuracy gap between the similarity score distributions for +Caucasian and African American female mated image pairs, as measured by d' +between distributions. To balance brightness across demographic groups, we +conduct three experiments, interpreting brightness in the face skin region +either as median pixel value or as the distribution of pixel values. Balancing +based on median brightness alone yields up to a 46.8% decrease in d', while +balancing based on brightness distribution yields up to a 57.6% decrease. In +all three cases, the similarity scores of the individual distributions improve, +with mean scores maximally improving 5.9% for Caucasian females and 3.7% for +African American females. + +
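A minimal sketch of median-based brightness balancing as described above: scale an image so that the median brightness of the face-skin region hits a chosen target. The skin-mask extraction and the full distribution-matching variant are out of scope here.

```python
import numpy as np

def balance_median_brightness(image, skin_mask, target_median):
    """
    Scale pixel values so the median brightness of the face-skin region
    matches `target_median` (simplified median-based balancing).
    image: float array in [0, 255]; skin_mask: boolean array of the same HxW.
    """
    gray = image.mean(axis=-1) if image.ndim == 3 else image
    current = np.median(gray[skin_mask])
    scale = target_median / max(current, 1e-6)
    return np.clip(image * scale, 0, 255)

# toy example: a dark 8x8 "face" whose skin median is pushed to 128
img = np.full((8, 8, 3), 60.0)
mask = np.ones((8, 8), dtype=bool)
balanced = balance_median_brightness(img, mask, target_median=128)
print(np.median(balanced.mean(axis=-1)))  # ~128
```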
+
+ comment: 14 pages, 11 figures, Conference submission +
+
+
+
+
+ + ☆ Multi-View Transformers for Airway-To-Lung Ratio Inference on Cardiac CT + Scans: The C4R Study + + +
+ The ratio of airway tree lumen to lung size (ALR), assessed at full +inspiration on high resolution full-lung computed tomography (CT), is a major +risk factor for chronic obstructive pulmonary disease (COPD). There is growing +interest to infer ALR from cardiac CT images, which are widely available in +epidemiological cohorts, to investigate the relationship of ALR to severe +COVID-19 and post-acute sequelae of SARS-CoV-2 infection (PASC). Previously, +cardiac scans included approximately 2/3 of the total lung volume with 5-6x +greater slice thickness than high-resolution (HR) full-lung (FL) CT. In this +study, we present a novel attention-based Multi-view Swin Transformer to infer +FL ALR values from segmented cardiac CT scans. For the supervised training we +exploit paired full-lung and cardiac CTs acquired in the Multi-Ethnic Study of +Atherosclerosis (MESA). Our network significantly outperforms a proxy direct +ALR inference on segmented cardiac CT scans and achieves accuracy and +reproducibility comparable with a scan-rescan reproducibility of the FL ALR +ground-truth. + +
+
+ comment: Accepted to appear in Proceedings of International Symposium on + Biomedical Imaging (ISBI), 2025 +
+
+
+
+
+ + ☆ Enhanced Multi-Scale Cross-Attention for Person Image Generation TPAMI + + +
+ In this paper, we propose a novel cross-attention-based generative +adversarial network (GAN) for the challenging person image generation task. +Cross-attention is a novel and intuitive multi-modal fusion method in which an +attention/correlation matrix is calculated between two feature maps of +different modalities. Specifically, we propose the novel XingGAN (or +CrossingGAN), which consists of two generation branches that capture the +person's appearance and shape, respectively. Moreover, we propose two novel +cross-attention blocks to effectively transfer and update the person's shape +and appearance embeddings for mutual improvement. This has not been considered +by any other existing GAN-based image generation work. To further learn the +long-range correlations between different person poses at different scales and +sub-regions, we propose two novel multi-scale cross-attention blocks. To tackle +the issue of independent correlation computations within the cross-attention +mechanism leading to noisy and ambiguous attention weights, which hinder +performance improvements, we propose a module called enhanced attention (EA). +Lastly, we introduce a novel densely connected co-attention module to fuse +appearance and shape features at different stages effectively. Extensive +experiments on two public datasets demonstrate that the proposed method +outperforms current GAN-based methods and performs on par with diffusion-based +methods. However, our method is significantly faster than diffusion-based +methods in both training and inference. + +
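For readers unfamiliar with cross-attention between two feature maps, here is a bare-bones PyTorch block in which one branch provides the queries and the other the keys and values; it deliberately omits the paper's Xing blocks, multi-scale design, enhanced attention, and co-attention fusion.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttention(nn.Module):
    """Basic cross-attention: queries from one modality, keys/values from the other."""
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, feat_a, feat_b):
        # feat_a, feat_b: (B, N, C) flattened feature maps of the two branches
        q, k, v = self.q(feat_a), self.k(feat_b), self.v(feat_b)
        attn = F.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1)
        return feat_a + attn @ v   # residual update of the query branch

shape_feat = torch.randn(2, 64, 128)       # e.g. pose/shape branch tokens
appearance_feat = torch.randn(2, 64, 128)  # appearance branch tokens
block = CrossAttention(128)
print(block(shape_feat, appearance_feat).shape)  # torch.Size([2, 64, 128])
```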
+
+ comment: Accepted to TPAMI, an extended version of a paper published in + ECCV2020. arXiv admin note: substantial text overlap with arXiv:2007.09278 +
+
+
+
+
+ + ☆ Feature-based One-For-All: A Universal Framework for Heterogeneous + Knowledge Distillation + + +
+ Knowledge distillation (KD) involves transferring knowledge from a
+pre-trained heavy teacher model to a lighter student model, thereby reducing
+the inference cost while maintaining comparable effectiveness. Prior KD
+techniques typically assume homogeneity between the teacher and student models.
+However, as technology advances, a wide variety of architectures have emerged,
+ranging from initial Convolutional Neural Networks (CNNs) to Vision
+Transformers (ViTs), and Multi-Layer Perceptrons (MLPs). Consequently,
+developing a universal KD framework compatible with any architecture has become
+an important research topic. In this paper, we introduce a feature-based
+one-for-all (FOFA) KD framework to enable feature distillation across diverse
+architectures. Our framework comprises two key components. First, we design
+prompt tuning blocks that incorporate student feedback, allowing teacher
+features to adapt to the student model's learning process. Second, we propose
+region-aware attention to mitigate the view mismatch problem between
+heterogeneous architectures. By leveraging these two modules, effective
+distillation of intermediate features can be achieved across heterogeneous
+architectures. Extensive experiments on CIFAR, ImageNet, and COCO demonstrate
+the superiority of the proposed method.
+
+
+
+
+
+ + ☆ Generative Planning with 3D-vision Language Pre-training for End-to-End + Autonomous Driving + + +
+ Autonomous driving is a challenging task that requires perceiving and +understanding the surrounding environment for safe trajectory planning. While +existing vision-based end-to-end models have achieved promising results, these +methods are still facing the challenges of vision understanding, decision +reasoning and scene generalization. To solve these issues, a generative +planning with 3D-vision language pre-training model named GPVL is proposed for +end-to-end autonomous driving. The proposed paradigm has two significant +aspects. On one hand, a 3D-vision language pre-training module is designed to +bridge the gap between visual perception and linguistic understanding in the +bird's eye view. On the other hand, a cross-modal language model is introduced +to generate holistic driving decisions and fine-grained trajectories with +perception and navigation information in an auto-regressive manner. Experiments +on the challenging nuScenes dataset demonstrate that the proposed scheme +achieves excellent performances compared with state-of-the-art methods. +Besides, the proposed GPVL presents strong generalization ability and real-time +potential when handling high-level commands in various scenarios. It is +believed that the effective, robust and efficient performance of GPVL is +crucial for the practical application of future autonomous driving systems. +Code is available at https://github.com/ltp1995/GPVL + +
+
+
+
+
+ + ☆ Exploring Task-Level Optimal Prompts for Visual In-Context Learning + + +
+ With the development of Vision Foundation Models (VFMs) in recent years, +Visual In-Context Learning (VICL) has become a better choice compared to +modifying models in most scenarios. Different from retraining or fine-tuning +model, VICL does not require modifications to the model's weights or +architecture, and only needs a prompt with demonstrations to teach VFM how to +solve tasks. Currently, significant computational cost for finding optimal +prompts for every test sample hinders the deployment of VICL, as determining +which demonstrations to use for constructing prompts is very costly. In this +paper, however, we find a counterintuitive phenomenon that most test samples +actually achieve optimal performance under the same prompts, and searching for +sample-level prompts only costs more time but results in completely identical +prompts. Therefore, we propose task-level prompting to reduce the cost of +searching for prompts during the inference stage and introduce two time-saving +yet effective task-level prompt search strategies. Extensive experimental +results show that our proposed method can identify near-optimal prompts and +reach the best VICL performance with a minimal cost that prior work has never +achieved. + +
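The task-level strategy essentially reduces to scoring a few candidate prompts once on a small validation subset and reusing the winner for every test sample. The sketch below assumes hypothetical `run_vicl` and `score_fn` callables and does not reproduce the paper's two specific search strategies.

```python
def select_task_level_prompt(candidate_prompts, val_samples, run_vicl, score_fn):
    """
    Pick one prompt for the whole task instead of searching per test sample.
    `run_vicl(prompt, sample)` returns the model's prediction for `sample`;
    `score_fn(prediction, sample)` returns a quality score (both are placeholders).
    """
    best_prompt, best_score = None, float("-inf")
    for prompt in candidate_prompts:
        # average score of this prompt over a small validation subset
        score = sum(score_fn(run_vicl(prompt, s), s) for s in val_samples) / len(val_samples)
        if score > best_score:
            best_prompt, best_score = prompt, score
    return best_prompt

# usage sketch: the chosen prompt is then reused for every test sample
# prompt = select_task_level_prompt(prompts, val_subset, run_vicl, miou)
# preds = [run_vicl(prompt, s) for s in test_samples]
```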
+
+
+
+
+ + ☆ MANTA: Diffusion Mamba for Efficient and Effective Stochastic Long-Term + Dense Anticipation + + +
+ Our work addresses the problem of stochastic long-term dense anticipation. +The goal of this task is to predict actions and their durations several minutes +into the future based on provided video observations. Anticipation over +extended horizons introduces high uncertainty, as a single observation can lead +to multiple plausible future outcomes. To address this uncertainty, stochastic +models are designed to predict several potential future action sequences. +Recent work has further proposed to incorporate uncertainty modelling for +observed frames by simultaneously predicting per-frame past and future actions +in a unified manner. While such joint modelling of actions is beneficial, it +requires long-range temporal capabilities to connect events across distant past +and future time points. However, the previous work struggles to achieve such a +long-range understanding due to its limited and/or sparse receptive field. To +alleviate this issue, we propose a novel MANTA (MAmba for ANTicipation) +network. Our model enables effective long-term temporal modelling even for very +long sequences while maintaining linear complexity in sequence length. We +demonstrate that our approach achieves state-of-the-art results on three +datasets - Breakfast, 50Salads, and Assembly101 - while also significantly +improving computational and memory efficiency. + +
+
+
+
+
+ + ☆ MMDocIR: Benchmarking Multi-Modal Retrieval for Long Documents + + +
+ Multi-modal document retrieval is designed to identify and retrieve various
+forms of multi-modal content, such as figures, tables, charts, and layout
+information from extensive documents. Despite its significance, there is a
+notable lack of a robust benchmark to effectively evaluate the performance of
+systems in multi-modal document retrieval. To address this gap, this work
+introduces a new benchmark, named MMDocIR, encompassing two distinct tasks:
+page-level and layout-level retrieval. The former focuses on localizing the
+most relevant pages within a long document, while the latter targets the
+detection of specific layouts, offering a finer granularity than whole-page
+analysis. A layout can refer to a variety of elements such as textual
+paragraphs, equations, figures, tables, or charts. The MMDocIR benchmark
+comprises a rich dataset featuring expertly annotated labels for 1,685
+questions and bootstrapped labels for 173,843 questions, making it a pivotal
+resource for advancing multi-modal document retrieval for both training and
+evaluation. Through rigorous experiments, we reveal that (i) visual retrievers
+significantly outperform their text counterparts, (ii) the MMDocIR train set
+can effectively benefit the training process of multi-modal document retrieval
+and (iii) text retrievers leveraging VLM-text perform much better than those
+using OCR-text. These findings underscore the potential advantages of
+integrating visual elements for multi-modal document retrieval.
+
+
+ comment: https://huggingface.co/MMDocIR +
+
+
+
+
+ + ☆ Boosting Diffusion Guidance via Learning Degradation-Aware Models for + Blind Super Resolution + + +
+ Recently, diffusion-based blind super-resolution (SR) methods have shown
+great ability to generate high-resolution images with abundant high-frequency
+detail, but the detail is often achieved at the expense of fidelity. Meanwhile,
+another line of research focusing on rectifying the reverse process of
+diffusion models (i.e., diffusion guidance), has demonstrated the power to
+generate high-fidelity results for non-blind SR. However, these methods rely on
+known degradation kernels, making them difficult to apply to blind SR. To
+address these issues, we introduce degradation-aware models that can be
+integrated into the diffusion guidance framework, eliminating the need to know
+degradation kernels. Additionally, we propose two novel techniques, input
+perturbation and guidance scalar, to further improve performance. Extensive
+experimental results show that our proposed method has superior performance
+over state-of-the-art methods on blind SR benchmarks.
+
+
+ comment: To appear in WACV 2025. Code is available at: + https://github.com/ryanlu2240/Boosting-Diffusion-Guidance-via-Learning-Degradation-Aware-Models-for-Blind-Super-Resolution +
+
+
+
+
+ + ☆ IDEA: Image Description Enhanced CLIP-Adapter + + +
+ CLIP (Contrastive Language-Image Pre-training) has attained great success in
+pattern recognition and computer vision. Transferring CLIP to downstream tasks
+(e.g. zero- or few-shot classification) is a hot topic in multimodal learning.
+However, current studies primarily focus on either prompt learning for text or
+adapter tuning for vision, without fully exploiting the complementary
+information and correlations among image-text pairs. In this paper, we propose
+an Image Description Enhanced CLIP-Adapter (IDEA) method to adapt CLIP to
+few-shot image classification tasks. This method captures fine-grained features
+by leveraging both visual features and textual descriptions of images. IDEA is
+a training-free method for CLIP, and it is comparable to, or even exceeds,
+state-of-the-art models on multiple tasks. Furthermore, we introduce
+Trainable-IDEA (T-IDEA), which extends IDEA by adding two lightweight learnable
+components (i.e., a projector and a learnable latent space), further enhancing
+the model's performance and achieving SOTA results on 11 datasets. As one
+important contribution, we employ the Llama model and design a comprehensive
+pipeline to generate textual descriptions for images of 11 datasets, resulting
+in a total of 1,637,795 image-text pairs, named "IMD-11". Our code and data are
+released at https://github.com/FourierAI/IDEA.
+
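A rough, training-free sketch of the flavor of adaptation described above: blend CLIP-style zero-shot similarity with similarities to a handful of support images and their textual descriptions. The fusion rule and hyperparameters here are illustrative assumptions, not IDEA's actual formulation.

```python
import numpy as np

def training_free_classify(img_feat, class_text_feats, support_img_feats,
                           support_desc_feats, support_labels, n_classes,
                           alpha=0.5, beta=0.5):
    """
    Training-free few-shot classification sketch: combine zero-shot similarity
    to class text embeddings with similarities to few-shot support images and
    their textual descriptions (all features assumed L2-normalized).
    """
    zero_shot = class_text_feats @ img_feat                    # (n_classes,)
    img_sim = support_img_feats @ img_feat                     # (n_support,)
    desc_sim = support_desc_feats @ img_feat                   # (n_support,)
    few_shot = np.zeros(n_classes)
    for sim_i, sim_d, y in zip(img_sim, desc_sim, support_labels):
        few_shot[y] += alpha * sim_i + (1 - alpha) * sim_d     # aggregate per class
    return np.argmax(zero_shot + beta * few_shot)

rng = np.random.default_rng(0)
norm = lambda x: x / np.linalg.norm(x, axis=-1, keepdims=True)
q = norm(rng.standard_normal(512))
print(training_free_classify(q, norm(rng.standard_normal((5, 512))),
                             norm(rng.standard_normal((10, 512))),
                             norm(rng.standard_normal((10, 512))),
                             rng.integers(0, 5, 10), n_classes=5))
```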
+
+
+
+
+ + ☆ Human Pose-Constrained UV Map Estimation + + +
+ UV map estimation is used in computer vision for detailed analysis of human +posture or activity. Previous methods assign pixels to body model vertices by +comparing pixel descriptors independently, without enforcing global coherence +or plausibility in the UV map. We propose Pose-Constrained Continuous Surface +Embeddings (PC-CSE), which integrates estimated 2D human pose into the +pixel-to-vertex assignment process. The pose provides global anatomical +constraints, ensuring that UV maps remain coherent while preserving local +precision. Evaluation on DensePose COCO demonstrates consistent improvement, +regardless of the chosen 2D human pose model. Whole-body poses offer better +constraints by incorporating additional details about the hands and feet. +Conditioning UV maps with human pose reduces invalid mappings and enhances +anatomical plausibility. In addition, we highlight inconsistencies in the +ground-truth annotations. + +
+
+
+
+
+ + ☆ Multi-visual modality micro drone-based structural damage detection + + +
+ Accurate detection and resilience of object detectors in structural damage +detection are important in ensuring the continuous use of civil infrastructure. +However, achieving robustness in object detectors remains a persistent +challenge, impacting their ability to generalize effectively. This study +proposes DetectorX, a robust framework for structural damage detection coupled +with a micro drone. DetectorX addresses the challenges of object detector +robustness by incorporating two innovative modules: a stem block and a spiral +pooling technique. The stem block introduces a dynamic visual modality by +leveraging the outputs of two Deep Convolutional Neural Network (DCNN) models. +The framework employs the proposed event-based reward reinforcement learning to +constrain the actions of a parent and child DCNN model leading to a reward. +This results in the induction of two dynamic visual modalities alongside the +Red, Green, and Blue (RGB) data. This enhancement significantly augments +DetectorX's perception and adaptability in diverse environmental situations. +Further, a spiral pooling technique, an online image augmentation method, +strengthens the framework by increasing feature representations by +concatenating spiraled and average/max pooled features. In three extensive +experiments: (1) comparative and (2) robustness, which use the Pacific +Earthquake Engineering Research Hub ImageNet dataset, and (3) field-experiment, +DetectorX performed satisfactorily across varying metrics, including precision +(0.88), recall (0.84), average precision (0.91), mean average precision (0.76), +and mean average recall (0.73), compared to the competing detectors including +You Only Look Once X-medium (YOLOX-m) and others. The study's findings indicate +that DetectorX can provide satisfactory results and demonstrate resilience in +challenging environments. + +
+
+
+
+
+ + ☆ Exploring ChatGPT for Face Presentation Attack Detection in Zero and + Few-Shot in-Context Learning + + +
+ This study highlights the potential of ChatGPT (specifically GPT-4o) as a +competitive alternative for Face Presentation Attack Detection (PAD), +outperforming several PAD models, including commercial solutions, in specific +scenarios. Our results show that GPT-4o demonstrates high consistency, +particularly in few-shot in-context learning, where its performance improves as +more examples are provided (reference data). We also observe that detailed +prompts enable the model to provide scores reliably, a behavior not observed +with concise prompts. Additionally, explanation-seeking prompts slightly +enhance the model's performance by improving its interpretability. Remarkably, +the model exhibits emergent reasoning capabilities, correctly predicting the +attack type (print or replay) with high accuracy in few-shot scenarios, despite +not being explicitly instructed to classify attack types. Despite these +strengths, GPT-4o faces challenges in zero-shot tasks, where its performance is +limited compared to specialized PAD systems. Experiments were conducted on a +subset of the SOTERIA dataset, ensuring compliance with data privacy +regulations by using only data from consenting individuals. These findings +underscore GPT-4o's promise in PAD applications, laying the groundwork for +future research to address broader data privacy concerns and improve +cross-dataset generalization. Code available here: +https://gitlab.idiap.ch/bob/bob.paper.wacv2025_chatgpt_face_pad + +
+
+ comment: Accepted in WACV workshop 2025 +
+
+
+
+
+ + ☆ Admitting Ignorance Helps the Video Question Answering Models to Answer + + +
+ Significant progress has been made in the field of video question answering +(VideoQA) thanks to deep learning and large-scale pretraining. Despite the +presence of sophisticated model structures and powerful video-text foundation +models, most existing methods focus solely on maximizing the correlation +between answers and video-question pairs during training. We argue that these +models often establish shortcuts, resulting in spurious correlations between +questions and answers, especially when the alignment between video and text +data is suboptimal. To address these spurious correlations, we propose a novel +training framework in which the model is compelled to acknowledge its ignorance +when presented with an intervened question, rather than making guesses solely +based on superficial question-answer correlations. We introduce methodologies +for intervening in questions, utilizing techniques such as displacement and +perturbation, and design frameworks for the model to admit its lack of +knowledge in both multi-choice VideoQA and open-ended settings. In practice, we +integrate a state-of-the-art model into our framework to validate its +effectiveness. The results clearly demonstrate that our framework can +significantly enhance the performance of VideoQA models with minimal structural +modifications. + +
+
+
+
+
+ + ☆ Few-Shot Learner Generalizes Across AI-Generated Image Detection + + +
+ Current fake image detectors trained on large synthetic image datasets +perform satisfactorily on limited studied generative models. However, they +suffer a notable performance decline over unseen models. Besides, collecting +adequate training data from online generative models is often expensive or +infeasible. To overcome these issues, we propose Few-Shot Detector (FSD), a +novel AI-generated image detector which learns a specialized metric space to +effectively distinguish unseen fake images by utilizing very few samples. +Experiments show FSD achieves state-of-the-art performance by $+7.4\%$ average +ACC on GenImage dataset. More importantly, our method is better capable of +capturing the intra-category common features in unseen images without further +training. + +
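The general few-shot metric-space idea can be illustrated with a nearest-prototype decision rule over embeddings; FSD's learned metric space and detector head are not reproduced here, and the random features below merely stand in for embeddings from a trained network.

```python
import numpy as np

def nearest_prototype_detector(query_feat, support_feats, support_labels):
    """
    Few-shot metric-space decision sketch: build one prototype (mean embedding)
    per class from a handful of labelled samples, then label the query by the
    nearest prototype.
    """
    classes = np.unique(support_labels)
    protos = np.stack([support_feats[support_labels == c].mean(axis=0) for c in classes])
    dists = np.linalg.norm(protos - query_feat, axis=1)
    return classes[np.argmin(dists)]

rng = np.random.default_rng(1)
real = rng.normal(0.0, 0.3, size=(8, 16))      # few "real image" embeddings
fake = rng.normal(2.0, 0.3, size=(8, 16))      # few embeddings from an unseen generator
feats = np.concatenate([real, fake])
labels = np.array([0] * 8 + [1] * 8)
query = rng.normal(2.0, 0.3, size=16)
print(nearest_prototype_detector(query, feats, labels))   # -> 1 (fake)
```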
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ $\texttt{InfoHier}$: Hierarchical Information Extraction via Encoding + and Embedding + + +
+ Analyzing large-scale datasets, especially involving complex and +high-dimensional data like images, is particularly challenging. While +self-supervised learning (SSL) has proven effective for learning +representations from unlabelled data, it typically focuses on flat, +non-hierarchical structures, missing the multi-level relationships present in +many real-world datasets. Hierarchical clustering (HC) can uncover these +relationships by organizing data into a tree-like structure, but it often +relies on rigid similarity metrics that struggle to capture the complexity of +diverse data types. To address these we envision $\texttt{InfoHier}$, a +framework that combines SSL with HC to jointly learn robust latent +representations and hierarchical structures. This approach leverages SSL to +provide adaptive representations, enhancing HC's ability to capture complex +patterns. Simultaneously, it integrates HC loss to refine SSL training, +resulting in representations that are more attuned to the underlying +information hierarchy. $\texttt{InfoHier}$ has the potential to improve the +expressiveness and performance of both clustering and representation learning, +offering significant benefits for data analysis, management, and information +retrieval. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Self-supervised Transformation Learning for Equivariant Representations NeurIPS + 2024 + + +
+ Unsupervised representation learning has significantly advanced various +machine learning tasks. In the computer vision domain, state-of-the-art +approaches utilize transformations like random crop and color jitter to achieve +invariant representations, embedding semantically the same inputs despite +transformations. However, this can degrade performance in tasks requiring +precise features, such as localization or flower classification. To address +this, recent research incorporates equivariant representation learning, which +captures transformation-sensitive information. However, current methods depend +on transformation labels and thus struggle with interdependency and complex +transformations. We propose Self-supervised Transformation Learning (STL), +replacing transformation labels with transformation representations derived +from image pairs. The proposed method ensures transformation representation is +image-invariant and learns corresponding equivariant transformations, enhancing +performance without increased batch complexity. We demonstrate the approach's +effectiveness across diverse classification and detection tasks, outperforming +existing methods in 7 out of 11 benchmarks and excelling in detection. By +integrating complex transformations like AugMix, unusable by prior equivariant +methods, this approach enhances performance across tasks, underscoring its +adaptability and resilience. Additionally, its compatibility with various base +models highlights its flexibility and broad applicability. The code is +available at https://github.com/jaemyung-u/stl. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ☆ RealVVT: Towards Photorealistic Video Virtual Try-on via Spatio-Temporal + Consistency + + +
+ Virtual try-on has emerged as a pivotal task at the intersection of computer
+vision and fashion, aimed at digitally simulating how clothing items fit on the
+human body. Despite notable progress in single-image virtual try-on (VTO),
+current methodologies often struggle to preserve a consistent and authentic
+appearance of clothing across extended video sequences. This challenge arises
+from the complexities of capturing dynamic human pose and maintaining target
+clothing characteristics. We leverage pre-existing video foundation models to
+introduce RealVVT, a photoRealistic Video Virtual Try-on framework tailored to
+bolster stability and realism within dynamic video contexts. Our methodology
+encompasses a Clothing & Temporal Consistency strategy, an Agnostic-guided
+Attention Focus Loss mechanism to ensure spatial consistency, and a Pose-guided
+Long Video VTO technique adept at handling extended video sequences. Extensive
+experiments across various datasets confirm that our approach outperforms
+existing state-of-the-art models in both single-image and video VTO tasks,
+offering a viable solution for practical applications within the realms of
+fashion e-commerce and virtual fitting environments.
+
+
+ comment: 10 pages (8 pages main text, 2 pages references), 5 figures in the + main text, and 4 pages supplementary materials with 3 additional figures +
+
+
+
+
+ + ☆ FlexiClip: Locality-Preserving Free-Form Character Animation + + +
+ Animating clipart images with seamless motion while maintaining visual +fidelity and temporal coherence presents significant challenges. Existing +methods, such as AniClipart, effectively model spatial deformations but often +fail to ensure smooth temporal transitions, resulting in artifacts like abrupt +motions and geometric distortions. Similarly, text-to-video (T2V) and +image-to-video (I2V) models struggle to handle clipart due to the mismatch in +statistical properties between natural video and clipart styles. This paper +introduces FlexiClip, a novel approach designed to overcome these limitations +by addressing the intertwined challenges of temporal consistency and geometric +integrity. FlexiClip extends traditional B\'ezier curve-based trajectory +modeling with key innovations: temporal Jacobians to correct motion dynamics +incrementally, continuous-time modeling via probability flow ODEs (pfODEs) to +mitigate temporal noise, and a flow matching loss inspired by GFlowNet +principles to optimize smooth motion transitions. These enhancements ensure +coherent animations across complex scenarios involving rapid movements and +non-rigid deformations. Extensive experiments validate the effectiveness of +FlexiClip in generating animations that are not only smooth and natural but +also structurally consistent across diverse clipart types, including humans and +animals. By integrating spatial and temporal modeling with pre-trained video +diffusion models, FlexiClip sets a new standard for high-quality clipart +animation, offering robust performance across a wide range of visual content. +Project Page: https://creative-gen.github.io/flexiclip.github.io/ + +
+
+ comment: 13 pages, 4 figures, 7 tables +
+
+
+
+
+ + GS-LIVO: Real-Time LiDAR, Inertial, and Visual Multi-sensor Fused + Odometry with Gaussian Mapping + + +
+ In recent years, 3D Gaussian splatting (3D-GS) has emerged as a novel scene +representation approach. However, existing vision-only 3D-GS methods often rely +on hand-crafted heuristics for point-cloud densification and face challenges in +handling occlusions and high GPU memory and computation consumption. +LiDAR-Inertial-Visual (LIV) sensor configuration has demonstrated superior +performance in localization and dense mapping by leveraging complementary +sensing characteristics: rich texture information from cameras, precise +geometric measurements from LiDAR, and high-frequency motion data from IMU. +Inspired by this, we propose a novel real-time Gaussian-based simultaneous +localization and mapping (SLAM) system. Our map system comprises a global +Gaussian map and a sliding window of Gaussians, along with an IESKF-based +odometry. The global Gaussian map consists of hash-indexed voxels organized in +a recursive octree, effectively covering sparse spatial volumes while adapting +to different levels of detail and scales. The Gaussian map is initialized +through multi-sensor fusion and optimized with photometric gradients. Our +system incrementally maintains a sliding window of Gaussians, significantly +reducing GPU computation and memory consumption by only optimizing the map +within the sliding window. Moreover, we implement a tightly coupled +multi-sensor fusion odometry with an iterative error state Kalman filter +(IESKF), leveraging real-time updating and rendering of the Gaussian map. Our +system represents the first real-time Gaussian-based SLAM framework deployable +on resource-constrained embedded systems, demonstrated on the NVIDIA Jetson +Orin NX platform. The framework achieves real-time performance while +maintaining robust multi-sensor fusion capabilities. All implementation +algorithms, hardware designs, and CAD models will be publicly available. + +
+
+
+
+
+ + ☆ TimeFlow: Longitudinal Brain Image Registration and Aging Progression + Analysis + + +
+ Predicting future brain states is crucial for understanding healthy aging and +neurodegenerative diseases. Longitudinal brain MRI registration, a cornerstone +for such analyses, has long been limited by its inability to forecast future +developments, reliance on extensive, dense longitudinal data, and the need to +balance registration accuracy with temporal smoothness. In this work, we +present \emph{TimeFlow}, a novel framework for longitudinal brain MRI +registration that overcomes all these challenges. Leveraging a U-Net +architecture with temporal conditioning inspired by diffusion models, TimeFlow +enables accurate longitudinal registration and facilitates prospective analyses +through future image prediction. Unlike traditional methods that depend on +explicit smoothness regularizers and dense sequential data, TimeFlow achieves +temporal consistency and continuity without these constraints. Experimental +results highlight its superior performance in both future timepoint prediction +and registration accuracy compared to state-of-the-art methods. Additionally, +TimeFlow supports novel biological brain aging analyses, effectively +differentiating neurodegenerative conditions from healthy aging. It eliminates +the need for segmentation, thereby avoiding the challenges of non-trivial +annotation and inconsistent segmentation errors. TimeFlow paves the way for +accurate, data-efficient, and annotation-free prospective analyses of brain +aging and chronic diseases. + +
+
+
+
+
+ + ☆ A Survey on Facial Image Privacy Preservation in Cloud-Based Services + + +
+ Facial recognition models are increasingly employed by commercial +enterprises, government agencies, and cloud service providers for identity +verification, consumer services, and surveillance. These models are often +trained using vast amounts of facial data processed and stored in cloud-based +platforms, raising significant privacy concerns. Users' facial images may be +exploited without their consent, leading to potential data breaches and misuse. +This survey presents a comprehensive review of current methods aimed at +preserving facial image privacy in cloud-based services. We categorize these +methods into two primary approaches: image obfuscation-based protection and +adversarial perturbation-based protection. We provide an in-depth analysis of +both categories, offering qualitative and quantitative comparisons of their +effectiveness. Additionally, we highlight unresolved challenges and propose +future research directions to improve privacy preservation in cloud computing +environments. + +
+
+
+
+
+ + ☆ Product of Gaussian Mixture Diffusion Model for non-linear MRI Inversion + + +
+ Diffusion models have recently shown remarkable results in magnetic resonance +imaging reconstruction. However, the employed networks typically are black-box +estimators of the (smoothed) prior score with tens of millions of parameters, +restricting interpretability and increasing reconstruction time. Furthermore, +parallel imaging reconstruction algorithms either rely on off-line coil +sensitivity estimation, which is prone to misalignment and restricting sampling +trajectories, or perform per-coil reconstruction, making the computational cost +proportional to the number of coils. To overcome this, we jointly reconstruct +the image and the coil sensitivities using the lightweight, +parameter-efficient, and interpretable product of Gaussian mixture diffusion +model as an image prior and a classical smoothness priors on the coil +sensitivities. The proposed method delivers promising results while allowing +for fast inference and demonstrating robustness to contrast out-of-distribution +data and sampling trajectories, comparable to classical variational penalties +such as total variation. Finally, the probabilistic formulation allows the +calculation of the posterior expectation and pixel-wise variance. + +
+
+
+
+
+ + ☆ BRIGHT-VO: Brightness-Guided Hybrid Transformer for Visual Odometry with + Multi-modality Refinement Module + + +
+ Visual odometry (VO) plays a crucial role in autonomous driving, robotic +navigation, and other related tasks by estimating the position and orientation +of a camera based on visual input. Significant progress has been made in +data-driven VO methods, particularly those leveraging deep learning techniques +to extract image features and estimate camera poses. However, these methods +often struggle in low-light conditions because of the reduced visibility of +features and the increased difficulty of matching keypoints. To address this +limitation, we introduce BrightVO, a novel VO model based on Transformer +architecture, which not only performs front-end visual feature extraction, but +also incorporates a multi-modality refinement module in the back-end that +integrates Inertial Measurement Unit (IMU) data. Using pose graph optimization, +this module iteratively refines pose estimates to reduce errors and improve +both accuracy and robustness. Furthermore, we create a synthetic low-light +dataset, KiC4R, which includes a variety of lighting conditions to facilitate +the training and evaluation of VO frameworks in challenging environments. +Experimental results demonstrate that BrightVO achieves state-of-the-art +performance on both the KiC4R dataset and the KITTI benchmarks. Specifically, +it provides an average improvement of 20% in pose estimation accuracy in normal +outdoor environments and 259% in low-light conditions, outperforming existing +methods. For widespread use and further development, the research work is fully +open-source at https://github.com/Anastasiawd/BrightVO. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + StereoGen: High-quality Stereo Image Generation from a Single Image + + +
+ State-of-the-art supervised stereo matching methods have achieved amazing +results on various benchmarks. However, these data-driven methods suffer from +generalization to real-world scenarios due to the lack of real-world annotated +data. In this paper, we propose StereoGen, a novel pipeline for high-quality +stereo image generation. This pipeline utilizes arbitrary single images as left +images and pseudo disparities generated by a monocular depth estimation model +to synthesize high-quality corresponding right images. Unlike previous methods +that fill the occluded area in warped right images using random backgrounds or +using convolutions to take nearby pixels selectively, we fine-tune a diffusion +inpainting model to recover the background. Images generated by our model +possess better details and undamaged semantic structures. Besides, we propose +Training-free Confidence Generation and Adaptive Disparity Selection. The +former suppresses the negative effect of harmful pseudo ground truth during +stereo training, while the latter helps generate a wider disparity distribution +and better synthetic images. Experiments show that models trained under our +pipeline achieve state-of-the-art zero-shot generalization results among all +published methods. The code will be available upon publication of the paper. + +
+
+
+
+
+ + ☆ Joint Learning of Depth and Appearance for Portrait Image Animation + + +
+ 2D portrait animation has experienced significant advancements in recent +years. Much research has utilized the prior knowledge embedded in large +generative diffusion models to enhance high-quality image manipulation. +However, most methods only focus on generating RGB images as output, and the +co-generation of consistent visual plus 3D output remains largely +under-explored. In our work, we propose to jointly learn the visual appearance +and depth simultaneously in a diffusion-based portrait image generator. Our +method embraces the end-to-end diffusion paradigm and introduces a new +architecture suitable for learning this conditional joint distribution, +consisting of a reference network and a channel-expanded diffusion backbone. +Once trained, our framework can be efficiently adapted to various downstream +applications, such as facial depth-to-image and image-to-depth generation, +portrait relighting, and audio-driven talking head animation with consistent 3D +output. + +
+
+
+
+
+ + MonSter: Marry Monodepth to Stereo Unleashes Power + + +
+ Stereo matching recovers depth from image correspondences. Existing methods
+struggle to handle ill-posed regions with limited matching cues, such as
+occlusions and textureless areas. To address this, we propose MonSter, a novel
+method that leverages the complementary strengths of monocular depth estimation
+and stereo matching. MonSter integrates monocular depth and stereo matching
+into a dual-branch architecture to iteratively improve each other.
+Confidence-based guidance adaptively selects reliable stereo cues for monodepth
+scale-shift recovery. The refined monodepth in turn guides stereo matching
+effectively in ill-posed regions. Such iterative mutual enhancement enables
+MonSter to evolve monodepth priors from coarse object-level structures to
+pixel-level geometry, fully unlocking the potential of stereo matching. As
+shown in Fig.1, MonSter ranks 1st across the five most commonly used
+leaderboards -- SceneFlow, KITTI 2012, KITTI 2015, Middlebury, and ETH3D --
+achieving up to 49.5% improvement (Bad 1.0 on ETH3D) over the previous best
+method. Comprehensive analysis verifies the effectiveness of MonSter in
+ill-posed regions. In terms of zero-shot generalization, MonSter significantly
+and consistently outperforms the state of the art across the board. The code is
+publicly available at: https://github.com/Junda24/MonSter.
+
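The scale-shift recovery step mentioned above can be illustrated with a least-squares alignment of monocular depth to confident stereo depths; MonSter's iterative, confidence-guided version is considerably richer than this sketch.

```python
import numpy as np

def recover_scale_shift(mono_depth, stereo_depth, confidence, thresh=0.8):
    """
    Align scale-ambiguous monocular depth to stereo depth at confident pixels:
    solve min over (s, b) of sum_i (s * d_mono_i + b - d_stereo_i)^2, using only
    pixels whose confidence exceeds `thresh` (simplified confidence guidance).
    """
    mask = confidence > thresh
    d_m, d_s = mono_depth[mask], stereo_depth[mask]
    A = np.stack([d_m, np.ones_like(d_m)], axis=1)         # columns: [d_mono, 1]
    (s, b), *_ = np.linalg.lstsq(A, d_s, rcond=None)
    return s * mono_depth + b                               # aligned monocular depth

mono = np.random.rand(64, 64) + 0.5                         # relative depth
stereo = 3.0 * mono + 1.0 + 0.01 * np.random.randn(64, 64)  # noisy metric-ish depth
conf = np.random.rand(64, 64)
aligned = recover_scale_shift(mono, stereo, conf)
print(np.abs(aligned - stereo).mean())                      # small residual
```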
+
+
+
+
+ + ☆ Detecting Wildfire Flame and Smoke through Edge Computing using Transfer + Learning Enhanced Deep Learning Models + + +
+ Autonomous unmanned aerial vehicles (UAVs) integrated with edge computing
+capabilities empower real-time data processing directly on the device,
+dramatically reducing latency in critical scenarios such as wildfire detection.
+This study underscores Transfer Learning's (TL) significance in boosting the
+performance of object detectors for identifying wildfire smoke and flames,
+especially when trained on limited datasets, and investigates the impact TL has
+on edge computing metrics. The latter focuses on how TL-enhanced You Only Look
+Once (YOLO) models perform in terms of inference time, power usage, and energy
+consumption when using edge computing devices. This study utilizes the Aerial
+Fire and Smoke Essential (AFSE) dataset as the target, with the Flame and Smoke
+Detection Dataset (FASDD) and the Microsoft Common Objects in Context (COCO)
+dataset serving as source datasets. We explore a two-stage cascaded TL method,
+utilizing D-Fire or FASDD as initial stage target datasets and AFSE as the
+subsequent stage. Through fine-tuning, TL significantly enhances detection
+precision, achieving up to 79.2% mean Average Precision (mAP@0.5), reduces
+training time, and increases model generalizability across the AFSE dataset.
+However, cascaded TL yielded no notable improvements and TL alone did not
+benefit the edge computing metrics evaluated. Lastly, this work found that
+YOLOv5n remains a powerful model when hardware acceleration is unavailable,
+processing images nearly twice as fast as its newer counterpart, YOLO11n.
+Overall, the results affirm TL's role in augmenting the accuracy of object
+detectors while also illustrating that additional enhancements are needed to
+improve edge computing performance.
+
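To make the cascaded transfer-learning idea concrete, here is a generic two-stage fine-tuning sketch using a torchvision classification backbone as a stand-in for the YOLO detectors in the paper; the checkpoint path, data loaders, and class setup are hypothetical, and the actual study fine-tunes detection models rather than classifiers.

```python
import torch
import torch.nn as nn
from torchvision import models

def load_stage(model, checkpoint_path=None):
    """Optionally warm-start from a previous fine-tuning stage (hypothetical path)."""
    if checkpoint_path:
        model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
    return model

def fine_tune(model, loader, epochs=5, lr=1e-4):
    """One fine-tuning stage: update all weights on the current target dataset."""
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    for _ in range(epochs):
        for images, labels in loader:
            opt.zero_grad()
            loss = loss_fn(model(images), labels)
            loss.backward()
            opt.step()
    return model

# Stage 1: generic pre-trained backbone -> intermediate fire/smoke data (e.g. FASDD)
# Stage 2: the stage-1 weights -> the small AFSE target set
backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
backbone.fc = nn.Linear(backbone.fc.in_features, 2)   # e.g. fire/smoke vs. background
# model = fine_tune(load_stage(backbone, "stage1.pt"), afse_loader)
```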
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Self-Organizing Edge Computing Distribution Framework for Visual SLAM + + +
+ Localization within a known environment is a crucial capability for mobile +robots. Simultaneous Localization and Mapping (SLAM) is a prominent solution to +this problem. SLAM is a framework that consists of a diverse set of +computational tasks ranging from real-time tracking to computation-intensive +map optimization. This combination can present a challenge for resource-limited +mobile robots. Previously, edge-assisted SLAM methods have demonstrated +promising real-time execution capabilities by offloading heavy computations +while performing real-time tracking onboard. However, the common approach of +utilizing a client-server architecture for offloading is sensitive to server +and network failures. In this article, we propose a novel edge-assisted SLAM +framework capable of self-organizing fully distributed SLAM execution across a +network of devices or functioning on a single device without connectivity. The +architecture consists of three layers and is designed to be device-agnostic, +resilient to network failures, and minimally invasive to the core SLAM system. +We have implemented and demonstrated the framework for monocular ORB SLAM3 and +evaluated it in both fully distributed and standalone SLAM configurations +against the ORB SLAM3. The experiment results demonstrate that the proposed +design matches the accuracy and resource utilization of the monolithic approach +while enabling collaborative execution. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Computerized Assessment of Motor Imitation for Distinguishing Autism in + Video (CAMI-2DNet) + + +
+ Motor imitation impairments are commonly reported in individuals with autism +spectrum conditions (ASCs), suggesting that motor imitation could be used as a +phenotype for addressing autism heterogeneity. Traditional methods for +assessing motor imitation are subjective, labor-intensive, and require +extensive human training. Modern Computerized Assessment of Motor Imitation +(CAMI) methods, such as CAMI-3D for motion capture data and CAMI-2D for video +data, are less subjective. However, they rely on labor-intensive data +normalization and cleaning techniques, and human annotations for algorithm +training. To address these challenges, we propose CAMI-2DNet, a scalable and +interpretable deep learning-based approach to motor imitation assessment in +video data, which eliminates the need for data normalization, cleaning and +annotation. CAMI-2DNet uses an encoder-decoder architecture to map a video to a +motion encoding that is disentangled from nuisance factors such as body shape +and camera views. To learn a disentangled representation, we employ synthetic +data generated by motion retargeting of virtual characters through the +reshuffling of motion, body shape, and camera views, as well as real +participant data. To automatically assess how well an individual imitates an +actor, we compute a similarity score between their motion encodings, and use it +to discriminate individuals with ASCs from neurotypical (NT) individuals. Our +comparative analysis demonstrates that CAMI-2DNet has a strong correlation with +human scores while outperforming CAMI-2D in discriminating ASC vs NT children. +Moreover, CAMI-2DNet performs comparably to CAMI-3D while offering greater +practicality by operating directly on video data and without the need for +ad-hoc data normalization and human annotations. + +
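+ The scoring step described above reduces to a similarity between two motion encodings; a minimal sketch is shown below, assuming the disentangled encoder already exists and using cosine similarity as the (assumed) similarity measure.

```python
import torch
import torch.nn.functional as F

def imitation_score(participant_encoding: torch.Tensor,
                    actor_encoding: torch.Tensor) -> torch.Tensor:
    """Cosine similarity between two motion encodings; higher means the
    participant's movement more closely imitates the actor's."""
    return F.cosine_similarity(participant_encoding, actor_encoding, dim=-1)

# Toy usage with random 256-d vectors standing in for encoder outputs.
z_participant, z_actor = torch.randn(256), torch.randn(256)
print(imitation_score(z_participant, z_actor).item())
```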
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ PACF: Prototype Augmented Compact Features for Improving Domain Adaptive + Object Detection + + +
+ In recent years, there has been significant advancement in object detection. +However, applying off-the-shelf detectors to a new domain leads to a significant +performance drop, caused by the domain gap. These detectors exhibit +higher-variance class-conditional distributions in the target domain than +in the source domain, along with mean shift. To address this problem, we +propose the Prototype Augmented Compact Features (PACF) framework to regularize +the distribution of intra-class features. Specifically, we provide an in-depth +theoretical analysis of the lower bound on the target feature-related +likelihood and derive the prototype cross entropy loss to further calibrate the +distribution of target RoI features. Furthermore, a mutual regularization +strategy is designed to enable the linear and prototype-based classifiers to +learn from each other, promoting feature compactness while enhancing +discriminability. Thanks to this PACF framework, we have obtained a more +compact cross-domain feature space, within which the variance of the target +features' class-conditional distributions has significantly decreased, and the +class-mean shift between the two domains has also been further reduced. The +results on different adaptation settings are state-of-the-art, which +demonstrates the broad applicability and effectiveness of the proposed approach. + +</p>
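+ A minimal sketch of a prototype cross-entropy loss in the spirit described above: RoI features are classified by negative distance to per-class prototypes and penalized with cross entropy. The EMA prototype update, class count, and feature dimension are illustrative assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

class PrototypeCrossEntropy(torch.nn.Module):
    """Cross entropy over negative distances to class prototypes."""
    def __init__(self, num_classes: int, feat_dim: int, momentum: float = 0.9):
        super().__init__()
        self.register_buffer("prototypes", torch.zeros(num_classes, feat_dim))
        self.momentum = momentum

    @torch.no_grad()
    def update(self, feats: torch.Tensor, labels: torch.Tensor) -> None:
        # EMA update of each class prototype from the current batch (assumed rule).
        for c in labels.unique():
            mean_c = feats[labels == c].mean(dim=0)
            self.prototypes[c] = (self.momentum * self.prototypes[c]
                                  + (1.0 - self.momentum) * mean_c)

    def forward(self, feats: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        dists = torch.cdist(feats, self.prototypes)   # (N, num_classes)
        return F.cross_entropy(-dists, labels)        # closer prototype -> larger logit

# Toy usage with 128-d RoI features and 4 classes.
crit = PrototypeCrossEntropy(num_classes=4, feat_dim=128)
feats, labels = torch.randn(32, 128), torch.randint(0, 4, (32,))
crit.update(feats, labels)
loss = crit(feats, labels)
```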
+
+
+
+
+ + ☆ Watermarking in Diffusion Model: Gaussian Shading with Exact Diffusion + Inversion via Coupled Transformations (EDICT) + + +
+ This paper introduces a novel approach to enhance the performance of Gaussian +Shading, a prevalent watermarking technique, by integrating the Exact Diffusion +Inversion via Coupled Transformations (EDICT) framework. While Gaussian Shading +traditionally embeds watermarks in a noise latent space, followed by iterative +denoising for image generation and noise addition for watermark recovery, its +inversion process is not exact, leading to potential watermark distortion. We +propose to leverage EDICT's ability to derive exact inverse mappings to refine +this process. Our method involves duplicating the watermark-infused noisy +latent and employing a reciprocal, alternating denoising and noising scheme +between the two latents, facilitated by EDICT. This allows for a more precise +reconstruction of both the image and the embedded watermark. Empirical +evaluation on standard datasets demonstrates that our integrated approach +yields a slight, yet statistically significant improvement in watermark +recovery fidelity. These results highlight the potential of EDICT to enhance +existing diffusion-based watermarking techniques by providing a more accurate +and robust inversion mechanism. To the best of our knowledge, this is the first +work to explore the synergy between EDICT and Gaussian Shading for digital +watermarking, opening new avenues for research in robust and high-fidelity +watermark embedding and extraction. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Image-to-Force Estimation for Soft Tissue Interaction in + Robotic-Assisted Surgery Using Structured Light + + +
+ For Minimally Invasive Surgical (MIS) robots, accurate haptic interaction +force feedback is essential for ensuring the safety of interacting with soft +tissue. However, most existing MIS robotic systems cannot facilitate direct +measurement of the interaction force with hardware sensors due to space +limitations. This letter introduces an effective vision-based scheme that +utilizes a One-Shot structured light projection with a designed pattern on soft +tissue, coupled with haptic information processing through a trained +image-to-force neural network. The images captured from the endoscopic stereo +camera are analyzed to reconstruct high-resolution 3D point clouds for soft +tissue deformation. Based on this, a modified PointNet-based force estimation +method is proposed, which excels in representing the complex mechanical +properties of soft tissue. Numerical force interaction experiments are +conducted on three silicone materials of different stiffness. The results +validate the effectiveness of the proposed scheme. + +</p>
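+ To make the image-to-force idea concrete, here is a minimal PointNet-style regressor that maps a deformed-tissue point cloud to a 3D force vector; the layer widths and head design are generic assumptions, not the paper's modified architecture.

```python
import torch
import torch.nn as nn

class PointNetForceRegressor(nn.Module):
    """Shared per-point MLP + max pooling + regression head -> (Fx, Fy, Fz)."""
    def __init__(self):
        super().__init__()
        self.point_mlp = nn.Sequential(
            nn.Conv1d(3, 64, 1), nn.ReLU(),
            nn.Conv1d(64, 128, 1), nn.ReLU(),
            nn.Conv1d(128, 1024, 1), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.Linear(1024, 256), nn.ReLU(),
            nn.Linear(256, 3),                # predicted contact force vector
        )

    def forward(self, points: torch.Tensor) -> torch.Tensor:
        # points: (B, N, 3) reconstructed from the structured-light stereo views
        x = self.point_mlp(points.transpose(1, 2))   # (B, 1024, N) per-point features
        x = x.max(dim=2).values                      # order-invariant global feature
        return self.head(x)

force = PointNetForceRegressor()(torch.randn(2, 2048, 3))   # -> (2, 3)
```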
+
+
+
+
+ + ☆ Densely Connected Parameter-Efficient Tuning for Referring Image + Segmentation + + +
+ In the domain of computer vision, Parameter-Efficient Tuning (PET) is +increasingly replacing the traditional paradigm of pre-training followed by +full fine-tuning. PET is particularly favored for its effectiveness in large +foundation models, as it streamlines transfer learning costs and optimizes +hardware utilization. However, the current PET methods are mainly designed for +single-modal optimization. While some pioneering studies have undertaken +preliminary explorations, they still remain at the level of aligned encoders +(e.g., CLIP) and lack exploration of misaligned encoders. These methods show +sub-optimal performance with misaligned encoders, as they fail to effectively +align the multimodal features during fine-tuning. In this paper, we introduce +DETRIS, a parameter-efficient tuning framework designed to enhance low-rank +visual feature propagation by establishing dense interconnections between each +layer and all preceding layers, which enables effective cross-modal feature +interaction and adaptation to misaligned encoders. We also suggest using text +adapters to improve textual features. Our simple yet efficient approach greatly +surpasses state-of-the-art methods with 0.9% to 1.8% backbone parameter +updates, evaluated on challenging benchmarks. Our project is available at +\url{https://github.com/jiaqihuang01/DETRIS}. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ☆ Scalable and High-Quality Neural Implicit Representation for 3D + Reconstruction + + +
+ Various SDF-based neural implicit surface reconstruction methods have been +proposed recently, and have demonstrated remarkable modeling capabilities. +However, due to the global nature and limited representation ability of a +single network, existing methods still suffer from many drawbacks, such as +limited accuracy and scale of the reconstruction. In this paper, we propose a +versatile, scalable and high-quality neural implicit representation to address +these issues. We integrate a divide-and-conquer approach into the neural +SDF-based reconstruction. Specifically, we model the object or scene as a +fusion of multiple independent local neural SDFs with overlapping regions. The +construction of our representation involves three key steps: (1) constructing +the distribution and overlap relationship of the local radiance fields based on +object structure or data distribution, (2) relative pose registration for +adjacent local SDFs, and (3) SDF blending. Thanks to the independent +representation of each local region, our approach can not only achieve +high-fidelity surface reconstruction, but also enable scalable scene +reconstruction. Extensive experimental results demonstrate the effectiveness +and practicality of our proposed method. + +
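+ The SDF-blending step described above can be illustrated with a small sketch that weights overlapping local SDFs by distance to their region centers; the callable interface, region model (spheres of a fixed radius), and weight function are assumptions for illustration only.

```python
import numpy as np

def blend_local_sdfs(query, local_sdfs, centers, radius):
    """Blend overlapping local SDFs (already registered into a common frame)
    with smooth weights that fade out toward each region's boundary."""
    values, weights = [], []
    for sdf, center in zip(local_sdfs, centers):
        d = np.linalg.norm(query - center)
        if d < radius:                           # the query lies in this local region
            values.append(sdf(query))
            weights.append(max(1.0 - d / radius, 1e-6))
    w = np.asarray(weights)
    return float(np.dot(w / w.sum(), values))

# Toy usage: two overlapping unit spheres as local SDFs.
sphere = lambda c: (lambda p: np.linalg.norm(p - c) - 1.0)
centers = [np.zeros(3), np.array([0.5, 0.0, 0.0])]
print(blend_local_sdfs(np.array([0.25, 0.0, 0.0]),
                       [sphere(c) for c in centers], centers, radius=2.0))
```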
+
+
+
+
+ + ☆ GOTLoc: General Outdoor Text-based Localization Using Scene Graph + Retrieval with OpenStreetMap + + +
+ We propose GOTLoc, a robust localization method capable of operating even in +outdoor environments where GPS signals are unavailable. The method achieves +this robust localization by leveraging comparisons between scene graphs +generated from text descriptions and maps. Existing text-based localization +studies typically represent maps as point clouds and identify the most similar +scenes by comparing embeddings of text and point cloud data. However, point +cloud maps have limited scalability as it is impractical to pre-generate maps +for all outdoor spaces. Furthermore, their large data size makes it challenging +to store and utilize them directly on actual robots. To address these issues, +GOTLoc leverages compact data structures, such as scene graphs, to store +spatial information, enabling individual robots to carry and utilize large +amounts of map data. Additionally, by utilizing publicly available map data, +such as OpenStreetMap, which provides global information on outdoor spaces, we +eliminate the need for additional effort to create custom map data. For +performance evaluation, we utilized the KITTI360Pose dataset in conjunction +with corresponding OpenStreetMap data to compare the proposed method with +existing approaches. Our results demonstrate that the proposed method achieves +accuracy comparable to algorithms relying on point cloud maps. Moreover, in +city-scale tests, GOTLoc required significantly less storage compared to point +cloud-based methods and completed overall processing within a few seconds, +validating its applicability to real-world robotics. Our code is available at +https://github.com/donghwijung/GOTLoc. + +
+
+
+
+
+ + ☆ MIAFEx: An Attention-based Feature Extraction Method for Medical Image + Classification + + +
+ Feature extraction techniques are crucial in medical image classification; +however, classical feature extractors combined with traditional machine +learning classifiers often exhibit significant limitations in providing +sufficient discriminative information for complex image sets. While +Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) have shown +promise in feature extraction, they are prone to overfitting due to the +inherent characteristics of medical imaging data, including small sample sizes +or high intra-class variance. In this work, the Medical Image Attention-based +Feature Extractor (MIAFEx) is proposed, a novel method that employs a learnable +refinement mechanism to enhance the classification token within the Transformer +encoder architecture. This mechanism adjusts the token based on learned +weights, improving the extraction of salient features and enhancing the model's +adaptability to the challenges presented by medical imaging data. The quality of +the MIAFEx output features is compared against classical feature extractors using +traditional and hybrid classifiers. Also, the performance of these features is +compared against modern CNN and ViT models in classification tasks, +demonstrating the superiority of MIAFEx in accuracy and robustness across multiple +complex medical imaging classification datasets. This advantage is particularly +pronounced in scenarios with limited training data, where traditional and +modern models often struggle to generalize effectively. The source code of this +proposal can be found at +https://github.com/Oscar-RamosS/Medical-Image-Attention-based-Feature-Extractor-MIAFEx + +</p>
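+ The abstract only states that the classification token is adjusted with learned weights; one plausible reading of such a refinement, sketched below purely as an assumption, is an elementwise learned gate applied to the Transformer's [CLS] output before classification.

```python
import torch
import torch.nn as nn

class CLSTokenRefiner(nn.Module):
    """Elementwise learned gating of the [CLS] token (one possible reading of
    a 'learnable refinement mechanism'; not the paper's exact design)."""
    def __init__(self, dim: int):
        super().__init__()
        self.gate = nn.Parameter(torch.ones(dim))   # learned refinement weights

    def forward(self, cls_token: torch.Tensor) -> torch.Tensor:
        return cls_token * torch.sigmoid(self.gate)

# Toy usage on a batch of 768-d ViT [CLS] embeddings.
refined = CLSTokenRefiner(768)(torch.randn(4, 768))
```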
+
+ comment: In preparation for Journal Submission +
+
+
+
+
+ + ☆ DynamicFace: High-Quality and Consistent Video Face Swapping using + Composable 3D Facial Priors + + +
+ Face swapping transfers the identity of a source face to a target face while +retaining attributes like expression, pose, hair, and background of the +target face. Advanced face swapping methods have achieved attractive results. +However, these methods often inadvertently transfer identity information from +the target face, compromising expression-related details and accurate identity. +We propose a novel method, DynamicFace, that leverages the power of a diffusion +model and plug-and-play temporal layers for video face swapping. First, we +introduce four fine-grained face conditions using 3D facial priors. All +conditions are designed to be disentangled from each other for precise and +unique control. Then, we adopt Face Former and ReferenceNet for high-level and +detailed identity injection. Through experiments on the FF++ dataset, we +demonstrate that our method achieves state-of-the-art results in face swapping, +showcasing superior image quality, identity preservation, and expression +accuracy. Besides, our method can be easily transferred to the video domain with +a temporal attention layer. Our code and results will be available on the project +page: https://dynamic-face.github.io/ + +</p>
+
+
+
+
+ + ☆ The Devil is in Temporal Token: High Quality Video Reasoning + Segmentation + + +
+ Existing methods for Video Reasoning Segmentation rely heavily on a single +special token to represent the object in the keyframe or the entire video, +inadequately capturing spatial complexity and inter-frame motion. To overcome +these challenges, we propose VRS-HQ, an end-to-end video reasoning segmentation +approach that leverages Multimodal Large Language Models (MLLMs) to inject rich +spatiotemporal features into hierarchical tokens. Our key innovations include +Temporal Dynamic Aggregation (TDA) and Token-driven Keyframe Selection (TKS). +Specifically, we design frame-level and temporal-level tokens that +utilize the MLLM's autoregressive learning to effectively capture both local and +global information. Subsequently, we apply a similarity-based weighted fusion +and frame selection strategy, then utilize SAM2 to perform keyframe +segmentation and propagation. To enhance keyframe localization accuracy, the +TKS filters keyframes based on SAM2's occlusion scores during inference. VRS-HQ +achieves state-of-the-art performance on ReVOS, surpassing VISA by +5.9%/12.5%/9.1% in J&F scores across the three subsets. These results highlight +the strong temporal reasoning and segmentation capabilities of our method. Code +and model weights will be released at VRS-HQ. + +</p>
+
+
+
+
+ + ☆ Comprehensive Subjective and Objective Evaluation Method for + Text-generated Video + + +
+ Recent text-to-video (T2V) technology advancements, as demonstrated by models +such as Gen3, Pika, and Sora, have significantly broadened its applicability +and popularity. This progress has created a growing demand for accurate quality +assessment metrics to evaluate the perceptual quality of text-generated videos +and optimize video generation models. However, assessing the quality of +text-generated videos remains challenging due to the presence of highly complex +distortions, such as unnatural actions and phenomena that defy human cognition. +To address these challenges, we constructed a large-scale benchmark dataset for +\textbf{T}ext-generated \textbf{V}ideo \textbf{eval}uation, +\textbf{T2VEval-Bench}, comprising 148 textual words and 1,783 videos generated +by 12 models. During the subjective evaluation, we collected five key scores: +overall impression, video quality, aesthetic quality, realness, and text-video +consistency. For objective evaluation, we developed the \textbf{T2VEval} model, +which assesses videos across three branches: quality, authenticity, and +consistency. Using an attention-based fusion module, T2VEval effectively +integrates features from each branch and predicts scores with the aid of a +large oracle model. Additionally, we implemented a progressive training +strategy, enabling each branch to learn targeted knowledge while maintaining +synergy with the others. Experimental results demonstrate that T2VEval achieves +state-of-the-art performance across multiple metrics. The dataset and code will +be open-sourced upon completion of the follow-up work. + +
+
+
+
+
+ + ☆ Multimodal Fake News Video Explanation Generation + + +
+ Multi-modal explanation involves assessing the veracity of a variety +of different content and relies on multiple information modalities to +comprehensively consider the relevance and consistency between modalities. Most +existing fake news video detection methods focus on improving accuracy while +ignoring the importance of providing explanations. In this paper, we propose a +novel problem, Fake News Video Explanation (FNVE): given a multimodal news post +containing both video and caption text, we aim to generate natural language +explanations to reveal the truth of predictions. To this end, we develop +FakeNVE, a new dataset of explanations for the truthfulness of multimodal posts, where +each explanation is a natural language (English) sentence describing the +attribution of a news thread. We benchmark FakeNVE by using a multimodal +transformer-based architecture. Subsequently, a BART-based autoregressive +decoder is used as the generator. Empirical evaluation shows compelling results for +various baselines (applicable to FNVE) across multiple evaluation metrics. We +also perform human evaluation on explanation generation, achieving high scores +for both adequacy and fluency. + +</p>
+
+
+
+
+ + ☆ Exploring the Efficacy of Meta-Learning: Unveiling Superior Data + Diversity Utilization of MAML Over Pre-training + + +
+ Currently, data and model size dominate the narrative in the training of +super-large, powerful models. However, there has been a lack of exploration on +the effect of other attributes of the training dataset on model performance. We +hypothesize that dataset diversity can impact the performance of vision models. +Our study shows positive correlations between test set accuracy and data +diversity, providing an argument for furthering the research of dataset +attributes beyond size. We analyzed pre-training and model-agnostic +meta-learning methods on twelve popular visual datasets (e.g., Omniglot, +CIFAR-FS, Aircraft) and five model configurations, including MAML variants with +different numbers of inner gradient steps and supervised learning. We show +moderate to strong positive correlations (R-squared: 0.15-0.42) between +accuracy and data diversity and weaker but significant correlations (R-squared: +~0.2) between loss and diversity. These findings support our hypothesis and +demonstrate a promising way for a deeper exploration of how formal data +diversity influences model performance. This initial study highlights the +potential of (Task2Vec) data diversity as a valuable measure in the rapidly +evolving field of large-scale learning and emphasizes that understanding the +dataset is key to building more powerful and generalizable models. + +
+
+
+
+
+ + ☆ Yuan: Yielding Unblemished Aesthetics Through A Unified Network for + Visual Imperfections Removal in Generated Images + + +
+ Generative AI presents transformative potential across various domains, from +creative arts to scientific visualization. However, the utility of AI-generated +imagery is often compromised by visual flaws, including anatomical +inaccuracies, improper object placements, and misplaced textual elements. These +imperfections pose significant challenges for practical applications. To +overcome these limitations, we introduce \textit{Yuan}, a novel framework that +autonomously corrects visual imperfections in text-to-image synthesis. +\textit{Yuan} uniquely conditions on both the textual prompt and the segmented +image, generating precise masks that identify areas in need of refinement +without requiring manual intervention -- a common constraint in previous +methodologies. Following the automated masking process, an advanced inpainting +module seamlessly integrates contextually coherent content into the identified +regions, preserving the integrity and fidelity of the original image and +associated text prompts. Through extensive experimentation on publicly +available datasets such as ImageNet100 and Stanford Dogs, along with a +custom-generated dataset, \textit{Yuan} demonstrated superior performance in +eliminating visual imperfections. Our approach consistently achieved higher +scores in quantitative metrics, including NIQE, BRISQUE, and PI, alongside +favorable qualitative evaluations. These results underscore \textit{Yuan}'s +potential to significantly enhance the quality and applicability of +AI-generated images across diverse fields. + +
+
+
+
+
+ + ☆ SuperSAM: Crafting a SAM Supernetwork via Structured Pruning and + Unstructured Parameter Prioritization + + +
+ Neural Architecture Search (NAS) is a powerful approach to automating the +design of efficient neural architectures. In contrast to traditional NAS +methods, recently proposed one-shot NAS methods prove to be more efficient in +performing NAS. One-shot NAS works by generating a singular weight-sharing +supernetwork that acts as a search space (container) of subnetworks. Despite +its achievements, designing the one-shot search space remains a major +challenge. In this work, we propose a search space design strategy for Vision +Transformer (ViT)-based architectures. In particular, we convert the Segment +Anything Model (SAM) into a weight-sharing supernetwork called SuperSAM. Our +approach involves automating the search space design via layer-wise structured +pruning and parameter prioritization. While the structured pruning applies +probabilistic removal of certain transformer layers, parameter prioritization +performs weight reordering and slicing of MLP-blocks in the remaining layers. +We train supernetworks on several datasets using the sandwich rule. For +deployment, we enhance subnetwork discovery by utilizing a program autotuner to +identify efficient subnetworks within the search space. The resulting +subnetworks are 30-70% smaller in size compared to the original pre-trained SAM +ViT-B, yet outperform the pretrained model. Our work introduces a new and +effective method for ViT NAS search-space design. + +</p>
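+ For readers unfamiliar with the sandwich rule mentioned above, the sketch below trains a tiny width-sliced supernetwork by accumulating gradients from the largest, smallest, and a few randomly sampled subnetworks in each step; the toy model, widths, and padding trick are illustrative assumptions and are unrelated to SAM itself.

```python
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

class SlicedLinear(nn.Linear):
    """Linear layer whose output width can be sliced at run time so that
    subnetworks share slices of the same weight matrix."""
    def forward(self, x, width):
        return F.linear(x, self.weight[:width], self.bias[:width])

class TinySupernet(nn.Module):
    def __init__(self, widths=(64, 128, 256)):
        super().__init__()
        self.widths = widths
        self.fc1 = SlicedLinear(32, max(widths))
        self.fc2 = nn.Linear(max(widths), 10)

    def forward(self, x, width):
        h = torch.relu(self.fc1(x, width))
        h = F.pad(h, (0, self.fc2.in_features - width))   # zero-pad to full width
        return self.fc2(h)

def sandwich_step(net, x, y, optimizer, n_random=2):
    """Sandwich rule: largest + smallest + random subnets share one update."""
    optimizer.zero_grad()
    widths = [max(net.widths), min(net.widths)] + random.choices(net.widths, k=n_random)
    for w in widths:
        F.cross_entropy(net(x, w), y).backward()   # gradients accumulate in shared weights
    optimizer.step()

net = TinySupernet()
opt = torch.optim.SGD(net.parameters(), lr=0.1)
sandwich_step(net, torch.randn(16, 32), torch.randint(0, 10, (16,)), opt)
```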
+
+
+
+
+ + ☆ Unified Few-shot Crack Segmentation and its Precise 3D Automatic + Measurement in Concrete Structures + + +
+ Visual-spatial systems have become increasingly essential in concrete crack +inspection. However, existing methods often lack adaptability to diverse +scenarios, exhibit limited robustness in image-based approaches, and struggle +with curved or complex geometries. To address these limitations, an innovative +framework for two-dimensional (2D) crack detection, three-dimensional (3D) +reconstruction, and 3D automatic crack measurement was proposed by integrating +computer vision technologies and multi-modal Simultaneous Localization and +Mapping (SLAM) in this study. Firstly, building on a base DeepLabv3+ +segmentation model, and incorporating specific refinements utilizing the foundation +model Segment Anything Model (SAM), we developed a crack segmentation method +with strong generalization across unfamiliar scenarios, enabling the generation +of precise 2D crack masks. To enhance the accuracy and robustness of 3D +reconstruction, Light Detection and Ranging (LiDAR) point clouds were utilized +together with image data and segmentation masks. By leveraging both image- and +LiDAR-SLAM, we developed a multi-frame and multi-modal fusion framework that +produces dense, colorized point clouds, effectively capturing crack semantics +at a 3D real-world scale. Furthermore, the geometric attributes of cracks were +measured automatically and directly within the 3D dense point cloud space, +surpassing the limitations of conventional 2D image-based measurements. This +advancement makes the method suitable for structural components with curved and +complex 3D geometries. Experimental results across various concrete structures +highlight the significant improvements and unique advantages of the proposed +method, demonstrating its effectiveness, accuracy, and robustness in real-world +applications. + +</p>
+
+
+
+
+ + ☆ Grounding Text-To-Image Diffusion Models For Controlled High-Quality + Image Generation + + +
+ Large-scale text-to-image (T2I) diffusion models have demonstrated an +outstanding performance in synthesizing diverse high-quality visuals from +natural language text captions. Multiple layout-to-image models have been +developed to control the generation process by utilizing a broad array of +layouts such as segmentation maps, edges, and human keypoints. In this work, we +present ObjectDiffusion, a model that takes inspiration from the top +cutting-edge image generative frameworks to seamlessly condition T2I models +with new bounding box capabilities. Specifically, we make substantial +modifications to the network architecture introduced in ControlNet to integrate +it with the condition processing and injection techniques proposed in GLIGEN. +ObjectDiffusion is initialized with pretraining parameters to leverage the +generation knowledge obtained from training on large-scale datasets. We +fine-tune ObjectDiffusion on the COCO2017 training dataset and evaluate it on +the COCO2017 validation dataset. Our model achieves an AP$_{50}$ of 46.6, an AR +of 44.5, and an FID of 19.8, outperforming the current SOTA model trained on +open-source datasets in all three metrics. ObjectDiffusion demonstrates +a distinctive capability in synthesizing diverse, high-quality, high-fidelity +images that seamlessly conform to the semantic and spatial control layout. +Evaluated in qualitative and quantitative tests, ObjectDiffusion exhibits +remarkable grounding abilities on closed-set and open-set settings across a +wide variety of contexts. The qualitative assessment verifies the ability of +ObjectDiffusion to generate multiple objects of different sizes and locations. + +</p>
+
+
+
+
+ + ☆ Patch-aware Vector Quantized Codebook Learning for Unsupervised Visual + Defect Detection + + +
+ Unsupervised visual defect detection is critical in industrial applications, +requiring a representation space that captures normal data features while +detecting deviations. Achieving a balance between expressiveness and +compactness is challenging; an overly expressive space risks inefficiency and +mode collapse, impairing detection accuracy. We propose a novel approach using +an enhanced VQ-VAE framework optimized for unsupervised defect detection. Our +model introduces a patch-aware dynamic code assignment scheme, enabling +context-sensitive code allocation to optimize spatial representation. This +strategy enhances normal-defect distinction and improves detection accuracy +during inference. Experiments on MVTecAD, BTAD, and MTSD datasets show our +method achieves state-of-the-art performance. + +
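+ As background for the code-assignment idea above, the sketch below shows the basic VQ lookup that such a scheme builds on: each patch embedding is mapped to its nearest codebook entry. The patch-aware dynamic allocation itself is not reproduced; shapes and names are illustrative.

```python
import torch

def assign_codes(patch_embeddings: torch.Tensor, codebook: torch.Tensor):
    """Standard VQ lookup: map each patch embedding (N, D) to its nearest
    entry of a learned codebook (K, D)."""
    dists = torch.cdist(patch_embeddings, codebook)   # (N, K) pairwise distances
    indices = dists.argmin(dim=1)                     # nearest code per patch
    return codebook[indices], indices

# Toy usage: 64 patch embeddings, a 512-entry codebook of 32-d codes.
quantized, idx = assign_codes(torch.randn(64, 32), torch.randn(512, 32))
```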
+
+ comment: 7 pages, Accepted to 36th IEEE ICTAI 2024 +
+
+
+
+
+ + ☆ Cancer-Net PCa-Seg: Benchmarking Deep Learning Models for Prostate + Cancer Segmentation Using Synthetic Correlated Diffusion Imaging + + +
+ Prostate cancer (PCa) is the most prevalent cancer among men in the United +States, accounting for nearly 300,000 cases, 29% of all diagnoses and 35,000 +total deaths in 2024. Traditional screening methods such as prostate-specific +antigen (PSA) testing and magnetic resonance imaging (MRI) have been pivotal in +diagnosis, but have faced limitations in specificity and generalizability. In +this paper, we explore the potential of enhancing PCa lesion segmentation using +a novel MRI modality called synthetic correlated diffusion imaging (CDI$^s$). +We employ several state-of-the-art deep learning models, including U-Net, +SegResNet, Swin UNETR, Attention U-Net, and LightM-UNet, to segment PCa lesions +from a 200 CDI$^s$ patient cohort. We find that SegResNet achieved superior +segmentation performance with a Dice-Sorensen coefficient (DSC) of $76.68 \pm +0.8$. Notably, the Attention U-Net, while slightly less accurate (DSC $74.82 +\pm 2.0$), offered a favorable balance between accuracy and computational +efficiency. Our findings demonstrate the potential of deep learning models in +improving PCa lesion segmentation using CDI$^s$ to enhance PCa management and +clinical support. + +
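+ For reference, the Dice-Sorensen coefficient reported above can be computed for binary masks with a few lines of NumPy; the epsilon term and toy masks are illustrative.

```python
import numpy as np

def dice_coefficient(pred: np.ndarray, target: np.ndarray, eps: float = 1e-7) -> float:
    """Dice-Sorensen coefficient between two binary masks, in [0, 1]."""
    pred, target = pred.astype(bool), target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)

# Toy usage on random lesion masks (real use would compare predicted vs. ground truth).
rng = np.random.default_rng(0)
print(dice_coefficient(rng.integers(0, 2, (128, 128)), rng.integers(0, 2, (128, 128))))
```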
+
+ comment: 8 pages, 2 figures, to be published in Studies in Computational + Intelligence. This paper introduces Cancer-Net PCa-Seg, a comprehensive + evaluation of deep learning models for prostate cancer segmentation using + synthetic correlated diffusion imaging (CDI$^s$). We benchmark five + state-of-the-art architectures: U-Net, SegResNet, Swin UNETR, Attention + U-Net, and LightM-UNet +
+
+
+
+
+ + ☆ Embodied Scene Understanding for Vision Language Models via MetaVQA + + +
+ Vision Language Models (VLMs) demonstrate significant potential as embodied +AI agents for various mobility applications. However, a standardized, +closed-loop benchmark for evaluating their spatial reasoning and sequential +decision-making capabilities is lacking. To address this, we present MetaVQA: a +comprehensive benchmark designed to assess and enhance VLMs' understanding of +spatial relationships and scene dynamics through Visual Question Answering +(VQA) and closed-loop simulations. MetaVQA leverages Set-of-Mark prompting and +top-down view ground-truth annotations from nuScenes and Waymo datasets to +automatically generate extensive question-answer pairs based on diverse +real-world traffic scenarios, ensuring object-centric and context-rich +instructions. Our experiments show that fine-tuning VLMs with the MetaVQA +dataset significantly improves their spatial reasoning and embodied scene +comprehension in safety-critical simulations, evident not only in improved VQA +accuracies but also in emerging safety-aware driving maneuvers. In addition, +the learning demonstrates strong transferability from simulation to real-world +observation. Code and data will be publicly available at +https://metadriverse.github.io/metavqa . + +
+
+ comment: for the project webpage, see https://metadriverse.github.io/metavqa +
+
+
+
+
+ + ☆ A Vessel Bifurcation Landmark Pair Dataset for Abdominal CT Deformable + Image Registration (DIR) Validation + + +
+ Deformable image registration (DIR) is an enabling technology in many +diagnostic and therapeutic tasks. Despite this, DIR algorithms have limited +clinical use, largely due to a lack of benchmark datasets for quality assurance +during development. To support future algorithm development, here we introduce +our first-of-its-kind abdominal CT DIR benchmark dataset, comprising large +numbers of highly accurate landmark pairs on matching blood vessel +bifurcations. Abdominal CT image pairs of 30 patients were acquired from +several public repositories as well as the authors' institution with IRB +approval. The two CTs of each pair were originally acquired for the same +patient on different days. An image processing workflow was developed and +applied to each image pair: 1) Abdominal organs were segmented with a deep +learning model, and image intensity within organ masks was overwritten. 2) +Matching image patches were manually identified between the two CTs of each image +pair. 3) Vessel bifurcation landmarks were labeled on one image of each image +patch pair. 4) Image patches were deformably registered, and landmarks were +projected onto the second image. 5) Landmark pair locations were refined +manually or with an automated process. This workflow resulted in 1895 total +landmark pairs, or 63 per case on average. Estimates of the landmark pair +accuracy using digital phantoms were 0.7+/-1.2mm. The data is published in +Zenodo at https://doi.org/10.5281/zenodo.14362785. Instructions for use can be +found at https://github.com/deshanyang/Abdominal-DIR-QA. This dataset is a +first-of-its-kind for abdominal DIR validation. The number, accuracy, and +distribution of landmark pairs will allow for robust validation of DIR +algorithms with precision beyond what is currently available. + +</p>
+
+ comment: 19 pages, 3 figures +
+
+
+
+
+ + ☆ VCRScore: Image captioning metric based on V\&L Transformers, CLIP, and + precision-recall + + +
+ Image captioning has become an essential Vision & Language research task. It +is about predicting the most accurate caption given a specific image or video. +The research community has achieved impressive results by continuously +proposing new models and approaches to improve overall model performance. +Nevertheless, despite the increasing number of proposals, the performance metrics used to +measure their advances have remained practically untouched through the years. As +evidence of this, metrics like BLEU, METEOR, CIDEr, and ROUGE are still +widely used today, alongside more sophisticated metrics such as BERTScore and +CLIPScore. + Hence, it is essential to adjust how we measure the advances, limitations, +and scope of new image captioning proposals, as well as to adapt +metrics to these new, more advanced image captioning approaches. + This work proposes a new evaluation metric for the image captioning problem. +To do so, we first generated a human-labeled dataset to assess the degree to +which captions correlate with the image's content. Taking these human +scores as ground truth, we propose a new metric and compare it with several +well-known metrics, from classical to newer ones. The proposed metric outperforms +the existing ones, and interesting insights are presented and discussed. + +</p>
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Few-Shot Adaptation of Training-Free Foundation Model for 3D Medical + Image Segmentation + + +
+ Vision foundation models have achieved remarkable progress across various +image analysis tasks. In the image segmentation task, foundation models like +the Segment Anything Model (SAM) enable generalizable zero-shot segmentation +through user-provided prompts. However, SAM primarily trained on natural +images, lacks the domain-specific expertise of medical imaging. This limitation +poses challenges when applying SAM to medical image segmentation, including the +need for extensive fine-tuning on specialized medical datasets and a dependency +on manual prompts, which are both labor-intensive and require intervention from +medical experts. + This work introduces the Few-shot Adaptation of Training-frEe SAM (FATE-SAM), +a novel method designed to adapt the advanced Segment Anything Model 2 (SAM2) +for 3D medical image segmentation. FATE-SAM reassembles pre-trained modules of +SAM2 to enable few-shot adaptation, leveraging a small number of support +examples to capture anatomical knowledge and perform prompt-free segmentation, +without requiring model fine-tuning. To handle the volumetric nature of medical +images, we incorporate a Volumetric Consistency mechanism that enhances spatial +coherence across 3D slices. We evaluate FATE-SAM on multiple medical imaging +datasets and compare it with supervised learning methods, zero-shot SAM +approaches, and fine-tuned medical SAM methods. Results show that FATE-SAM +delivers robust and accurate segmentation while eliminating the need for large +annotated datasets and expert intervention. FATE-SAM provides a practical, +efficient solution for medical image segmentation, making it more accessible +for clinical applications. + +
+
+
+
+
+ + ☆ Benchmarking Robustness of Contrastive Learning Models for Medical + Image-Report Retrieval + + +
+ Medical images and reports offer invaluable insights into patient health. The +heterogeneity and complexity of these data hinder effective analysis. To bridge +this gap, we investigate contrastive learning models for cross-domain +retrieval, which associates medical images with their corresponding clinical +reports. This study benchmarks the robustness of four state-of-the-art +contrastive learning models: CLIP, CXR-RePaiR, MedCLIP, and CXR-CLIP. We +introduce an occlusion retrieval task to evaluate model performance under +varying levels of image corruption. Our findings reveal that all evaluated +models are highly sensitive to out-of-distribution data, as evidenced by the +proportional decrease in performance with increasing occlusion levels. While +MedCLIP exhibits slightly more robustness, its overall performance remains +significantly behind CXR-CLIP and CXR-RePaiR. CLIP, trained on a +general-purpose dataset, struggles with medical image-report retrieval, +highlighting the importance of domain-specific training data. The evaluation of +this work suggests that more effort needs to be spent on improving the +robustness of these models. By addressing these limitations, we can develop +more reliable cross-domain retrieval models for medical applications. + +
+
+ comment: This work is accepted to AAAI 2025 Workshop -- the 9th International + Workshop on Health Intelligence +
+
+
+
+
+ + ☆ Deep Self-Supervised Disturbance Mapping with the OPERA Sentinel-1 + Radiometric Terrain Corrected SAR Backscatter Product + + +
+ Mapping land surface disturbances supports disaster response, resource and +ecosystem management, and climate adaptation efforts. Synthetic aperture radar +(SAR) is an invaluable tool for disturbance mapping, providing consistent +time-series images of the ground regardless of weather or illumination +conditions. Despite SAR's potential for disturbance mapping, processing SAR +data to an analysis-ready format requires expertise and significant compute +resources, particularly for large-scale global analysis. In October 2023, +NASA's Observational Products for End-Users from Remote Sensing Analysis +(OPERA) project released the near-global Radiometric Terrain Corrected SAR +backscatter from Sentinel-1 (RTC-S1) dataset, providing publicly available, +analysis-ready SAR imagery. In this work, we utilize this new dataset to +systematically analyze land surface disturbances. As labeling SAR data is often +prohibitively time-consuming, we train a self-supervised vision transformer - +which requires no labels to train - on OPERA RTC-S1 data to estimate a +per-pixel distribution from the set of baseline imagery and assess disturbances +when there is significant deviation from the modeled distribution. To test our +model's capability and generality, we evaluate three different natural +disasters - which represent high-intensity, abrupt disturbances - from three +different regions of the world. Across events, our approach yields high quality +delineations: F1 scores exceeding 0.6 and Areas Under the Precision-Recall +Curve exceeding 0.65, consistently outperforming existing SAR disturbance +methods. Our findings suggest that a self-supervised vision transformer is +well-suited for global disturbance mapping and can be a valuable tool for +operational, near-global disturbance monitoring, particularly when labeled data +does not exist. + +
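+ A highly simplified sketch of the deviation-based disturbance idea described above: model each pixel's baseline backscatter with a per-pixel Gaussian and flag large z-scores in a new acquisition. The paper estimates the distribution with a self-supervised transformer; the Gaussian baseline, threshold, and toy values below are stand-in assumptions.

```python
import numpy as np

def disturbance_map(baseline_stack: np.ndarray, new_image: np.ndarray,
                    z_thresh: float = 3.0) -> np.ndarray:
    """Flag pixels whose new backscatter deviates strongly from the per-pixel
    baseline distribution. baseline_stack: (T, H, W) in dB, new_image: (H, W)."""
    mu = baseline_stack.mean(axis=0)
    sigma = baseline_stack.std(axis=0) + 1e-6
    z = np.abs(new_image - mu) / sigma
    return z > z_thresh

# Toy usage: 12 baseline acquisitions plus one new acquisition with a synthetic disturbance.
rng = np.random.default_rng(1)
stack = rng.normal(-12.0, 1.0, (12, 64, 64))
new = rng.normal(-12.0, 1.0, (64, 64))
new[20:30, 20:30] -= 8.0                     # simulated backscatter drop
print(disturbance_map(stack, new).sum())
```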
+
+ comment: 19 pages, 18 figures, 5 tables. Preprint. Submitted to JSTARS +
+
+
+
+
+ + ☆ Deep Distance Map Regression Network with Shape-aware Loss for + Imbalanced Medical Image Segmentation + + +
+ Small object segmentation, like tumor segmentation, is a difficult and +critical task in the field of medical image analysis. Although deep learning-based +methods have achieved promising performance, they are restricted to the +use of binary segmentation masks. Inspired by the rigorous mapping between +binary segmentation masks and distance maps, we adopt the distance map as a novel +ground truth and employ a network to compute it. +Specifically, we propose a new segmentation framework that incorporates the +existing binary segmentation network and a lightweight regression network +(dubbed LR-Net). Thus, the LR-Net can convert the distance map computation +into a regression task and leverage the rich information of distance maps. +Additionally, we derive a shape-aware loss by employing distance maps as +penalty maps to infer the complete shape of an object. We evaluated our approach +on the MICCAI 2017 Liver Tumor Segmentation (LiTS) Challenge dataset and a clinical +dataset. Experimental results show that our approach outperforms +classification-based methods as well as other existing state-of-the-art methods. + +</p>
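+ The distance-map ground truth used above can be constructed from a binary mask with SciPy's Euclidean distance transform; the signed-distance convention and the plain L1 loss in this sketch are assumptions, and the shape-aware weighting is not reproduced.

```python
import numpy as np
from scipy.ndimage import distance_transform_edt

def mask_to_signed_distance(mask: np.ndarray) -> np.ndarray:
    """Signed distance map from a binary mask: positive inside the object,
    negative outside (one common convention)."""
    inside = distance_transform_edt(mask)
    outside = distance_transform_edt(1 - mask)
    return inside - outside

# Toy usage: a square "tumor" mask becomes the regression target; a noisy copy
# stands in for a network prediction and is scored with a plain L1 loss.
mask = np.zeros((64, 64), dtype=np.uint8)
mask[24:40, 24:40] = 1
target = mask_to_signed_distance(mask)
prediction = target + 0.1 * np.random.randn(64, 64)
l1_loss = np.abs(prediction - target).mean()
```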
+
+ comment: Conference +
+
+
+
+
+ + ☆ Generative Medical Image Anonymization Based on Latent Code Projection + and Optimization + + +
+ Medical image anonymization aims to protect patient privacy by removing +identifying information, while preserving the data utility to solve downstream +tasks. In this paper, we address the medical image anonymization problem with a +two-stage solution: latent code projection and optimization. In the projection +stage, we design a streamlined encoder to project input images into a latent +space and propose a co-training scheme to enhance the projection process. In +the optimization stage, we refine the latent code using two deep loss functions +designed to address the trade-off between identity protection and data utility +dedicated to medical images. Through a comprehensive set of qualitative and +quantitative experiments, we showcase the effectiveness of our approach on the +MIMIC-CXR chest X-ray dataset by generating anonymized synthetic images that +can serve as training set for detecting lung pathologies. Source codes are +available at https://github.com/Huiyu-Li/GMIA. + +
+
+ comment: Conference +
+
+
+
+
+ + ☆ Relation U-Net + + +
+ Towards clinical interpretations, this paper presents a new +''output-with-confidence'' segmentation neural network with multiple input +images and multiple output segmentation maps and their pairwise relations. A +confidence score of the test image without ground-truth can be estimated from +the difference among the estimated relation maps. We evaluate the method based +on the widely used vanilla U-Net for segmentation and our new model is named +Relation U-Net which can output segmentation maps of the input images as well +as an estimated confidence score of the test image without ground-truth. +Experimental results on four public datasets show that Relation U-Net can not +only provide better accuracy than vanilla U-Net but also estimate a confidence +score which is linearly correlated to the segmentation accuracy on test images. + +
+
+ comment: ISBI 2025 +</p>
+
+
+
+
+ + ☆ Self Pre-training with Adaptive Mask Autoencoders for Variable-Contrast + 3D Medical Imaging + + +
+ The Masked Autoencoder (MAE) has recently demonstrated effectiveness in +pre-training Vision Transformers (ViT) for analyzing natural images. By +reconstructing complete images from partially masked inputs, the ViT encoder +gathers contextual information to predict the missing regions. This capability +to aggregate context is especially important in medical imaging, where +anatomical structures are functionally and mechanically linked to surrounding +regions. However, current methods do not consider variations in the number of +input images, which is typically the case in real-world Magnetic Resonance (MR) +studies. To address this limitation, we propose a 3D Adaptive Masked +Autoencoder (AMAE) architecture that accommodates a variable number of 3D +input contrasts per subject. A magnetic resonance imaging (MRI) dataset of +45,364 subjects was used for pretraining, and a subset of 1,648 training, 193 +validation, and 215 test subjects was used for fine-tuning. The results +demonstrate that self pre-training with this adaptive masked autoencoder can +enhance infarct segmentation performance by 2.8%-3.7% for ViT-based +segmentation models. + +</p>
+
+ comment: 5 pages, ISBI 2025 accepted +
+
+
+
+
+ + ☆ Salient Information Preserving Adversarial Training Improves Clean and + Robust Accuracy + + +
+ In this work we introduce Salient Information Preserving Adversarial Training +(SIP-AT), an intuitive method for relieving the robustness-accuracy trade-off +incurred by traditional adversarial training. SIP-AT uses salient image regions +to guide the adversarial training process in such a way that fragile features +deemed meaningful by an annotator remain unperturbed during training, allowing +models to learn highly predictive non-robust features without sacrificing +overall robustness. This technique is compatible with both human-based and +automatically generated salience estimates, allowing SIP-AT to be used as a +part of human-driven model development without forcing SIP-AT to be reliant +upon additional human data. We perform experiments across multiple datasets and +architectures and demonstrate that SIP-AT is able to boost the clean accuracy +of models while maintaining a high degree of robustness against attacks at +multiple epsilon levels. We complement our central experiments with an +observational study measuring the rate at which human subjects successfully +identify perturbed images. This study helps build a more intuitive +understanding of adversarial attack strength and demonstrates the heightened +importance of low-epsilon robustness. Our results demonstrate the efficacy of +SIP-AT and provide valuable insight into the risks posed by adversarial samples +of various strengths. + +
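+ A minimal sketch of the masking idea behind salience-preserving adversarial training: generate an adversarial perturbation and zero it inside a salient-region mask so those pixels stay clean. The single-step FGSM attack, epsilon, and mask format are illustrative assumptions; the paper's exact attack and training loop may differ.

```python
import torch
import torch.nn.functional as F

def salience_masked_fgsm(model, x, y, salient_mask, eps=8 / 255):
    """FGSM-style perturbation applied only outside the salient region.
    salient_mask: (B, 1, H, W) with 1 on pixels that must remain unperturbed."""
    x = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)
    loss.backward()
    delta = eps * x.grad.sign()
    delta = delta * (1.0 - salient_mask)          # leave salient pixels untouched
    return (x + delta).clamp(0, 1).detach()

# Toy usage with a tiny classifier, random images, labels, and salience masks.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
x, y = torch.rand(4, 3, 32, 32), torch.randint(0, 10, (4,))
mask = (torch.rand(4, 1, 32, 32) > 0.7).float()
x_adv = salience_masked_fgsm(model, x, y, mask)
```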
+
+
+
+
+ + ☆ SHYI: Action Support for Contrastive Learning in High-Fidelity + Text-to-Image Generation + + +
+ In this project, we address the issue of infidelity in text-to-image +generation, particularly for actions involving multiple objects. For this, we +build on top of the CONFORM framework, which uses Contrastive Learning to +improve the accuracy of the generated image for multiple objects. However, the +depiction of actions involving multiple different objects still has large +room for improvement. To improve this, we employ semantically hypergraphic +contrastive adjacency learning, combining an enhanced contrastive +structure with a "contrast but link" technique. We further amend Stable +Diffusion's understanding of actions using InteractDiffusion. As evaluation +metrics, we use CLIP image-text similarity and TIFA. In addition, we conducted a +user study. + Our method shows promising results even with verbs that Stable Diffusion +understands only moderately well. We then provide future directions by analyzing the +results. + Our codebase can be found on polybox under the link: +https://polybox.ethz.ch/index.php/s/dJm3SWyRohUrFxn + +</p>
+
+ comment: Main content 4 pages +
+
+
+
+
+ + ♻ ☆ T2V-CompBench: A Comprehensive Benchmark for Compositional Text-to-video + Generation + + +
+ Text-to-video (T2V) generative models have advanced significantly, yet their +ability to compose different objects, attributes, actions, and motions into a +video remains unexplored. Previous text-to-video benchmarks also neglect this +important ability for evaluation. In this work, we conduct the first systematic +study on compositional text-to-video generation. We propose T2V-CompBench, the +first benchmark tailored for compositional text-to-video generation. +T2V-CompBench encompasses diverse aspects of compositionality, including +consistent attribute binding, dynamic attribute binding, spatial relationships, +motion binding, action binding, object interactions, and generative numeracy. +We further carefully design evaluation metrics of multimodal large language +model (MLLM)-based, detection-based, and tracking-based metrics, which can +better reflect the compositional text-to-video generation quality of seven +proposed categories with 1400 text prompts. The effectiveness of the proposed +metrics is verified by correlation with human evaluations. We also benchmark +various text-to-video generative models and conduct in-depth analysis across +different models and various compositional categories. We find that +compositional text-to-video generation is highly challenging for current +models, and we hope our attempt could shed light on future research in this +direction. + +
+
+ comment: Project page: https://t2v-compbench-2025.github.io/ Code: + https://github.com/KaiyueSun98/T2V-CompBench/tree/V2 +
+
+
+
+
+ + ♻ ☆ DeblurDiNAT: A Compact Model with Exceptional Generalization and Visual + Fidelity on Unseen Domains + + +
+ Recent deblurring networks have effectively restored clear images from +blurred ones. However, they often struggle with generalization to unknown +domains. Moreover, these models typically focus on distortion metrics such as +PSNR and SSIM, neglecting the critical aspect of metrics aligned with human +perception. To address these limitations, we propose DeblurDiNAT, a deblurring +Transformer based on Dilated Neighborhood Attention. First, DeblurDiNAT employs +an alternating dilation factor paradigm to capture both local and global +blurred patterns, enhancing generalization and perceptual clarity. Second, a +local cross-channel learner helps the Transformer block understand the +short-range relationships between adjacent channels. Additionally, we present a +linear feed-forward network with a simple yet effective design. Finally, a +dual-stage feature fusion module is introduced as an alternative to the +existing approach, which efficiently processes multi-scale visual information +across network levels. Compared to state-of-the-art models, our compact +DeblurDiNAT demonstrates superior generalization capabilities and achieves +remarkable performance in perceptual metrics, while maintaining a favorable +model size. + +</p>
+
+
+
+
+ + ♻ ☆ Click-Calib: A Robust Extrinsic Calibration Method for Surround-View + Systems + + +
+ Surround-View System (SVS) is an essential component in Advanced Driver +Assistance System (ADAS) and requires precise calibrations. However, +conventional offline extrinsic calibration methods are cumbersome and +time-consuming as they rely heavily on physical patterns. Additionally, these +methods primarily focus on short-range areas surrounding the vehicle, resulting +in lower calibration quality in more distant zones. To address these +limitations, we propose Click-Calib, a pattern-free approach for offline SVS +extrinsic calibration. Without requiring any special setup, the user only needs +to click a few keypoints on the ground in natural scenes. Unlike other offline +calibration approaches, Click-Calib optimizes camera poses over a wide range by +minimizing reprojection distance errors of keypoints, thereby achieving +accurate calibrations at both short and long distances. Furthermore, +Click-Calib supports both single-frame and multiple-frame modes, with the +latter offering even better results. Evaluations on our in-house dataset and +the public WoodScape dataset demonstrate its superior accuracy and robustness +compared to baseline methods. Code is available at +https://github.com/lwangvaleo/click_calib. + +
+
+
+
+
+ + ♻ ☆ A General Framework for Inference-time Scaling and Steering of Diffusion + Models + + +
+ Diffusion models produce impressive results in modalities ranging from images +and video to protein design and text. However, generating samples with +user-specified properties remains a challenge. Recent research proposes +fine-tuning models to maximize rewards that capture desired properties, but +these methods require expensive training and are prone to mode collapse. In +this work, we propose Feynman Kac (FK) steering, an inference-time framework +for steering diffusion models with reward functions. FK steering works by +sampling a system of multiple interacting diffusion processes, called +particles, and resampling particles at intermediate steps based on scores +computed using functions called potentials. Potentials are defined using +rewards for intermediate states and are selected such that a high value +indicates that the particle will yield a high-reward sample. We explore various +choices of potentials, intermediate rewards, and samplers. We evaluate FK +steering on text-to-image and text diffusion models. For steering text-to-image +models with a human preference reward, we find that FK steering a 0.8B +parameter model outperforms a 2.6B parameter fine-tuned model on prompt +fidelity, with faster sampling and no training. For steering text diffusion +models with rewards for text quality and specific text attributes, we find that +FK steering generates lower perplexity, more linguistically acceptable outputs +and enables gradient-free control of attributes like toxicity. Our results +demonstrate that inference-time scaling and steering of diffusion models, even +with off-the-shelf rewards, can provide significant sample quality gains and +controllability benefits. Code is available at +https://github.com/zacharyhorvitz/Fk-Diffusion-Steering . + +
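+ The core resampling step of an inference-time particle scheme like the one described above can be sketched in a few lines: weight each intermediate latent ("particle") by an exponentiated potential and resample with replacement. The softmax potential, temperature, and toy shapes are illustrative assumptions, not the paper's specific choices.

```python
import torch

def resample_particles(latents: torch.Tensor, rewards: torch.Tensor,
                       temperature: float = 1.0) -> torch.Tensor:
    """Resample diffusion particles in proportion to exp(reward / temperature).
    latents: (P, ...) intermediate latents, rewards: (P,) intermediate rewards."""
    weights = torch.softmax(rewards / temperature, dim=0)
    idx = torch.multinomial(weights, num_samples=latents.shape[0], replacement=True)
    return latents[idx]

# Toy usage: 8 particles of a 4x64x64 latent scored by some intermediate reward.
particles = torch.randn(8, 4, 64, 64)
rewards = torch.randn(8)
particles = resample_particles(particles, rewards)
```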
+
+
+
+
+ + ♻ ☆ SA-MLP: A Low-Power Multiplication-Free Deep Network for 3D Point Cloud + Classification in Resource-Constrained Environments + + +
+ Point cloud classification plays a crucial role in the processing and +analysis of data from 3D sensors such as LiDAR, which are commonly used in +applications like autonomous vehicles, robotics, and environmental monitoring. +However, traditional neural networks, which rely heavily on multiplication +operations, often face challenges in terms of high computational costs and +energy consumption. This study presents a novel family of efficient MLP-based +architectures designed to improve the computational efficiency of point cloud +classification tasks in sensor systems. The baseline model, Mul-MLP, utilizes +conventional multiplication operations, while Add-MLP and Shift-MLP replace +multiplications with addition and shift operations, respectively. These +replacements leverage more sensor-friendly operations that can significantly +reduce computational overhead, making them particularly suitable for +resource-constrained sensor platforms. To further enhance performance, we +propose SA-MLP, a hybrid architecture that alternates between shift and adder +layers, preserving the network depth while optimizing computational efficiency. +Unlike previous approaches such as ShiftAddNet, which increase the layer count +and limit representational capacity by freezing shift weights, SA-MLP fully +exploits the complementary advantages of shift and adder layers by employing +distinct learning rates and optimizers. Experimental results show that Add-MLP +and Shift-MLP achieve competitive performance compared to Mul-MLP, while SA-MLP +surpasses the baseline, delivering results comparable to state-of-the-art MLP +models in terms of both classification accuracy and computational efficiency. +This work offers a promising, energy-efficient solution for sensor-driven +applications requiring real-time point cloud classification, particularly in +environments with limited computational resources. + +
+
+
+
+
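+ For illustration only, a rough sketch of the two multiplication-free primitives mentioned above: an adder-style layer based on L1 distances and a shift layer whose weights are signed powers of two. This is a floating-point emulation under assumed layer shapes, not the SA-MLP code; on real hardware the shift layer would map to integer bit-shifts.
+ <pre>
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+
+ def adder_layer(x, w):
+     # Adder layer: replaces dot products with (negative) L1 distances,
+     # so the forward pass uses only additions and subtractions.
+     # x: (batch, in_dim), w: (in_dim, out_dim)
+     return -np.abs(x[:, :, None] - w[None, :, :]).sum(axis=1)
+
+ def shift_layer(x, shift_exp, sign):
+     # Shift layer: weights restricted to signed powers of two, so each
+     # multiplication becomes a bit-shift on fixed-point hardware.
+     w = sign * (2.0 ** shift_exp)            # (in_dim, out_dim)
+     return x @ w                             # emulated here with a matmul
+
+ x = rng.normal(size=(4, 32))                 # e.g. pooled point-cloud features
+ h = adder_layer(x, rng.normal(size=(32, 64)))
+ y = shift_layer(h, rng.integers(-3, 2, size=(64, 16)),
+                 rng.choice([-1.0, 1.0], size=(64, 16)))
+ print(y.shape)   # (4, 16)
+ </pre>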
+ + ♻ ☆ A design of Convolutional Neural Network model for the Diagnosis of the + COVID-19 + + +
+ With the spread of COVID-19 around the globe over the past year, the usage of +artificial intelligence (AI) algorithms and image processing methods to analyze +the chest X-ray images of patients with COVID-19 has become essential. The +COVID-19 virus recognition in the lung area of a patient is one of the basic +and essential needs of clinical centers and hospitals. Most research in this +field has been devoted to papers on the basis of deep learning methods +utilizing CNNs (Convolutional Neural Network), which mainly deal with the +screening of sick and healthy people. In this study, a new structure of a +19-layer CNN has been recommended for accurate recognition of COVID-19 +from chest X-ray images. The offered CNN is developed to serve as a +precise diagnosis system for a three-class (viral pneumonia, Normal, COVID) and +a four-class classification (Lung opacity, Normal, COVID-19, and pneumonia). A +comparison is conducted among the outcomes of the offered procedure and some +popular pretrained networks, including Inception, AlexNet, ResNet50, +SqueezeNet, and VGG19, based on Specificity, Accuracy, Precision, +Sensitivity, Confusion Matrix, and F1-score. The experimental results of the +offered CNN method demonstrate its superiority over existing published +procedures. This method can be a useful tool for clinicians in making +well-founded decisions about COVID-19. + +
+
+ comment: Important mistakes found. There's no new version currently. Also + contradiction with authorship +
+
+
+
+
+ + ♻ ☆ Compression with Global Guidance: Towards Training-free High-Resolution + MLLMs Acceleration + + +
+ Multimodal large language models (MLLMs) have attracted considerable +attention due to their exceptional performance in visual content understanding +and reasoning. However, their inference efficiency has been a notable concern, +as the increasing length of multimodal contexts leads to quadratic complexity. +Token compression techniques, which reduce the number of visual tokens, have +demonstrated their effectiveness in reducing computational costs. Yet, these +approaches have struggled to keep pace with the rapid advancements in MLLMs, +especially the AnyRes strategy in the context of high-resolution image +understanding. In this paper, we propose a novel token compression method, +GlobalCom$^2$, tailored for high-resolution MLLMs that receive both the +thumbnail and multiple crops. GlobalCom$^2$ treats the tokens derived from the +thumbnail as the "commander" of the entire token compression process, directing +the allocation of retention ratios and the specific compression for each crop. +In this way, redundant tokens are eliminated while important local details are +adaptively preserved to the highest extent feasible. Empirical results across +10 benchmarks reveal that GlobalCom$^2$ achieves an optimal balance between +performance and efficiency, and consistently outperforms state-of-the-art token +compression methods with LLaVA-NeXT-7B/13B models. Our code is released at +https://github.com/xuyang-liu16/GlobalCom2. + +
+
+ comment: Our code is released at + https://github.com/xuyang-liu16/GlobalCom2 +
+
+
+
+
+ + ♻ ☆ Identifying Spurious Correlations using Counterfactual Alignment + + +
+ Models driven by spurious correlations often yield poor generalization +performance. We propose the counterfactual (CF) alignment method to detect and +quantify spurious correlations of black box classifiers. Our methodology is +based on counterfactual images generated with respect to one classifier being +input into other classifiers to see if they also induce changes in the outputs +of these classifiers. The relationship between these responses can be +quantified and used to identify specific instances where a spurious correlation +exists. This is validated by observing intuitive trends in face-attribute and +waterbird classifiers, as well as by fabricating spurious correlations and +detecting their presence, both visually and quantitatively. Furthermore, +utilizing the CF alignment method, we demonstrate that we can evaluate robust +optimization methods (GroupDRO, JTT, and FLAC) by detecting a reduction in +spurious correlations. + +
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR), Code: + https://github.com/ieee8023/latentshift +
+
+
+
+
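+ A toy, hypothetical illustration of the counterfactual-alignment idea above: a counterfactual is generated with respect to one classifier, and the induced output changes of other classifiers are compared. Linear scorers stand in for the image classifiers and for the latent-space counterfactual generator used in the paper.
+ <pre>
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+
+ # Toy linear "classifiers" over a 10-d feature vector (stand-ins for CNNs).
+ w_base = rng.normal(size=10)
+ w_a, w_b = rng.normal(size=10), rng.normal(size=10)
+ base_clf = lambda x: float(x @ w_base)
+ other_clfs = {"attr_a": lambda x: float(x @ w_a),
+               "attr_b": lambda x: float(x @ w_b)}
+
+ def counterfactual(x):
+     # Stand-in for a generative counterfactual that flips base_clf's output;
+     # the paper uses latent-shift style counterfactual images instead.
+     return x - 0.5 * w_base * np.sign(x @ w_base)
+
+ def cf_alignment(x):
+     # How much does a counterfactual built for base_clf also move the other
+     # classifiers? Strong co-movement hints at a shared, possibly spurious,
+     # feature.
+     x_cf = counterfactual(x)
+     d_base = base_clf(x_cf) - base_clf(x)
+     return {name: (clf(x_cf) - clf(x)) / (d_base + 1e-8)
+             for name, clf in other_clfs.items()}
+
+ print(cf_alignment(rng.normal(size=10)))
+ </pre>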
+ + ♻ ☆ PACE: Marrying generalization in PArameter-efficient fine-tuning with + Consistency rEgularization NeurIPS 2024 + + +
+ Parameter-Efficient Fine-Tuning (PEFT) effectively adapts pre-trained +transformers to downstream tasks. However, the optimization of task +performance often comes at the cost of generalizability in fine-tuned models. +To address this issue, we theoretically connect smaller weight gradient norms +during training and larger datasets to the improvements in model +generalization. Motivated by this connection, we propose reducing gradient +norms for enhanced generalization and aligning the fine-tuned model with its +pre-trained counterpart to retain knowledge from large-scale pre-training data. +Yet, naive alignment does not guarantee gradient reduction and can potentially +cause gradient explosion, complicating efforts to manage gradients. To address +such an issue, we propose PACE, marrying generalization of PArameter-efficient +fine-tuning with Consistency rEgularization. We perturb the features learned by +the adapter with multiplicative noise and ensure the fine-tuned model +remains consistent for the same sample under different perturbations. Theoretical +analysis shows that PACE not only implicitly regularizes gradients for enhanced +generalization, but also implicitly aligns the fine-tuned and pre-trained +models to retain knowledge. Experimental evidence supports our theories. PACE +surpasses existing PEFT methods in visual adaptation tasks (VTAB-1k, FGVC, +few-shot learning, domain adaptation), showcasing its potential for +resource-efficient fine-tuning. It also improves LoRA in text classification +(GLUE) and mathematical reasoning (GSM-8K). The code is available at +https://github.com/MaxwellYaoNi/PACE + +
+
+ comment: Accepted by NeurIPS 2024 as a spotlight +
+
+
+
+
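+ A simplified sketch of the regularizer described above, under assumed module names: adapter features are perturbed with multiplicative noise, and a consistency term ties together two stochastic passes of the same sample. This is not the official PACE implementation.
+ <pre>
+ import torch
+ import torch.nn as nn
+
+ class NoisyAdapter(nn.Module):
+     # Low-rank adapter whose output is perturbed with multiplicative noise.
+     def __init__(self, dim, rank=8, sigma=0.1):
+         super().__init__()
+         self.down, self.up = nn.Linear(dim, rank), nn.Linear(rank, dim)
+         self.sigma = sigma
+
+     def forward(self, x):
+         h = self.up(torch.relu(self.down(x)))
+         if self.training:
+             h = h * (1 + self.sigma * torch.randn_like(h))
+         return x + h
+
+ def pace_loss(model, x, y, task_loss, lam=0.1):
+     # Two stochastic passes of the same sample; a consistency term keeps the
+     # perturbed predictions close, implicitly regularizing gradients.
+     out1, out2 = model(x), model(x)
+     consistency = ((out1 - out2) ** 2).mean()
+     return task_loss(out1, y) + lam * consistency
+
+ # Toy usage on random data.
+ model = nn.Sequential(nn.Linear(16, 16), NoisyAdapter(16), nn.Linear(16, 3))
+ x, y = torch.randn(4, 16), torch.randint(0, 3, (4,))
+ loss = pace_loss(model, x, y, nn.CrossEntropyLoss())
+ loss.backward()
+ </pre>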
+ + ♻ ☆ TextSleuth: Towards Explainable Tampered Text Detection + + +
+ Recently, tampered text detection has attracted increasing attention due to +its essential role in information security. Although existing methods can +detect the tampered text region, the interpretation of such detection remains +unclear, making the prediction unreliable. To address this problem, we propose +to explain the basis of tampered text detection with natural language via large +multimodal models. To fill the data gap for this task, we propose a +large-scale, comprehensive dataset, ETTD, which contains both pixel-level +annotations for tampered text region and natural language annotations +describing the anomaly of the tampered text. Multiple methods are employed to +improve the quality of the proposed data. For example, elaborate queries are +introduced to generate high-quality anomaly descriptions with GPT4o. A fused +mask prompt is proposed to reduce confusion when querying GPT4o to generate +anomaly descriptions. To automatically filter out low-quality annotations, we +also propose to prompt GPT4o to recognize tampered texts before describing the +anomaly, and to filter out the responses with low OCR accuracy. To further +improve explainable tampered text detection, we propose a simple yet effective +model called TextSleuth, which achieves improved fine-grained perception and +cross-domain generalization by focusing on the suspected region, with a +two-stage analysis paradigm and an auxiliary grounding prompt. Extensive +experiments on both the ETTD dataset and the public dataset have verified the +effectiveness of the proposed methods. In-depth analysis is also provided to +inspire further research. Our dataset and code will be open-source. + +
+
+ comment: The first work for explainable tampered text detection +
+
+
+
+
+ + ♻ ☆ A Foundation Language-Image Model of the Retina (FLAIR): Encoding Expert + Knowledge in Text Supervision + + +
+ Foundation vision-language models are currently transforming computer vision, +and are on the rise in medical imaging fueled by their very promising +generalization capabilities. However, the initial attempts to transfer this new +paradigm to medical imaging have shown less impressive performances than those +observed in other domains, due to the significant domain shift and the complex, +expert domain knowledge inherent to medical-imaging tasks. Motivated by the +need for domain-expert foundation models, we present FLAIR, a pre-trained +vision-language model for universal retinal fundus image understanding. To this +end, we compiled 38 open-access, mostly categorical fundus imaging datasets +from various sources, with up to 101 different target conditions and 288,307 +images. We integrate the expert's domain knowledge in the form of descriptive +textual prompts, during both pre-training and zero-shot inference, enhancing +the less-informative categorical supervision of the data. Such a textual +expert's knowledge, which we compiled from the relevant clinical literature and +community standards, describes the fine-grained features of the pathologies as +well as the hierarchies and dependencies between them. We report comprehensive +evaluations, which illustrate the benefit of integrating expert knowledge and +the strong generalization capabilities of FLAIR under difficult scenarios with +domain shifts or unseen categories. When adapted with a lightweight linear +probe, FLAIR outperforms fully-trained, dataset-focused models, more so in the +few-shot regimes. Interestingly, FLAIR outperforms by a wide margin +larger-scale generalist image-language models and retina domain-specific +self-supervised networks, which emphasizes the potential of embedding experts' +domain knowledge and the limitations of generalist models in medical imaging. + +
+
+ comment: Accepted in Medical Image Analysis. The pre-trained model is + available at: https://github.com/jusiro/FLAIR +
+
+
+
+
+ + ♻ ☆ MADiff: Text-Guided Fashion Image Editing with Mask Prediction and + Attention-Enhanced Diffusion + + +
+ Text-guided image editing models have achieved great success in the general domain. +However, directly applying these models to the fashion domain may encounter two +issues: (1) Inaccurate localization of the editing region; (2) Weak editing +magnitude. To address these issues, the MADiff model is proposed. Specifically, +to more accurately identify the editing region, the MaskNet is proposed, in which +the foreground region, densepose, and mask prompts from a large language model are +fed into a lightweight UNet to predict the mask of the editing region. To +strengthen the editing magnitude, the Attention-Enhanced Diffusion Model is +proposed, where the noise map, attention map, and the mask from MaskNet are fed +into the proposed Attention Processor to produce a refined noise map. By +integrating the refined noise map into the diffusion model, the edited image +can better align with the target prompt. Given the absence of benchmarks in +fashion image editing, we constructed a dataset named Fashion-E, comprising +28,390 image-text pairs in the training set, and 2,639 image-text pairs for four +types of fashion tasks in the evaluation set. Extensive experiments on +Fashion-E demonstrate that our proposed method can accurately predict the mask +of the editing region and significantly enhance editing magnitude in fashion image +editing compared to the state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Industrial Anomaly Detection and Localization Using Weakly-Supervised + Residual Transformers + + +
+ Recent advancements in industrial anomaly detection (AD) have demonstrated +that incorporating a small number of anomalous samples during training can +significantly enhance accuracy. However, this improvement often comes at the +cost of extensive annotation efforts, which are impractical for many real-world +applications. In this paper, we introduce a novel framework, Weakly-supervised +RESidual Transformer (WeakREST), designed to achieve high anomaly detection +accuracy while minimizing the reliance on manual annotations. First, we +reformulate the pixel-wise anomaly localization task into a block-wise +classification problem. Second, we introduce a residual-based feature +representation called Positional Fast Anomaly Residuals (PosFAR), which captures +anomalous patterns more effectively. To leverage this feature, we adapt the +Swin Transformer for enhanced anomaly detection and localization. Additionally, +we propose a weak annotation approach, utilizing bounding boxes and image tags +to define anomalous regions. This approach establishes a semi-supervised +learning context that reduces the dependency on precise pixel-level labels. To +further improve the learning process, we develop a novel ResMixMatch algorithm, +capable of handling the interplay between weak labels and residual-based +representations. + On the benchmark dataset MVTec-AD, our method achieves an Average Precision +(AP) of $83.0\%$, surpassing the previous best result of $82.7\%$ in the +unsupervised setting. In the supervised AD setting, WeakREST attains an AP of +$87.6\%$, outperforming the previous best of $86.0\%$. Notably, even when using +weaker annotations such as bounding boxes, WeakREST exceeds the performance of +leading methods relying on pixel-wise supervision, achieving an AP of $87.1\%$ +compared to the prior best of $86.0\%$ on MVTec-AD. + +
+
+ comment: 13 pages,7 figures +
+
+
+
+
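+ To make the block-wise reformulation above concrete, a small hypothetical helper that turns weak bounding-box annotations into block-level anomaly labels (the residual PosFAR features, Swin backbone, and ResMixMatch training are omitted):
+ <pre>
+ import numpy as np
+
+ def boxes_to_block_labels(boxes, image_hw=(256, 256), block=16):
+     # Convert weak bounding-box annotations into block-wise labels:
+     # each (block x block) patch is marked anomalous if it overlaps any box.
+     H, W = image_hw
+     labels = np.zeros((H // block, W // block), dtype=np.int64)
+     for (x0, y0, x1, y1) in boxes:
+         bi0, bj0 = y0 // block, x0 // block
+         bi1, bj1 = (y1 - 1) // block, (x1 - 1) // block
+         labels[bi0:bi1 + 1, bj0:bj1 + 1] = 1
+     return labels
+
+ blocks = boxes_to_block_labels([(40, 32, 96, 80)])
+ print(blocks.sum(), blocks.shape)   # number of anomalous blocks, (16, 16)
+ </pre>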
+ + ♻ ☆ The Surprising Ineffectiveness of Pre-Trained Visual Representations for + Model-Based Reinforcement Learning NeurIPS 2024 + + +
+ Visual Reinforcement Learning (RL) methods often require extensive amounts of +data. As opposed to model-free RL, model-based RL (MBRL) offers a potential +solution with efficient data utilization through planning. Additionally, RL +lacks generalization capabilities for real-world tasks. Prior work has shown +that incorporating pre-trained visual representations (PVRs) enhances sample +efficiency and generalization. While PVRs have been extensively studied in the +context of model-free RL, their potential in MBRL remains largely unexplored. +In this paper, we benchmark a set of PVRs on challenging control tasks in a +model-based RL setting. We investigate the data efficiency, generalization +capabilities, and the impact of different properties of PVRs on the performance +of model-based agents. Our results, perhaps surprisingly, reveal that for MBRL +current PVRs are not more sample efficient than learning representations from +scratch, and that they do not generalize better to out-of-distribution (OOD) +settings. To explain this, we analyze the quality of the trained dynamics +model. Furthermore, we show that data diversity and network architecture are +the most important contributors to OOD generalization performance. + +
+
+ comment: Published at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024). Project page: https://schneimo.com/pvr4mbrl/ +
+
+
+
+
+ + ♻ ☆ CGCOD: Class-Guided Camouflaged Object Detection + + +
+ Camouflaged Object Detection (COD) aims to identify objects that blend +seamlessly into their surroundings. The inherent visual complexity of +camouflaged objects, including their low contrast with the background, diverse +textures, and subtle appearance variations, often obscures semantic cues, +making accurate segmentation highly challenging. Existing methods primarily +rely on visual features, which are insufficient to handle the variability and +intricacy of camouflaged objects, leading to unstable object perception and +ambiguous segmentation results. To tackle these limitations, we introduce a +novel task, class-guided camouflaged object detection (CGCOD), which extends +the traditional COD task by incorporating object-specific class knowledge to +enhance detection robustness and accuracy. To facilitate this task, we present +a new dataset, CamoClass, comprising real-world camouflaged objects with class +annotations. Furthermore, we propose a multi-stage framework, CGNet, which +incorporates a plug-and-play class prompt generator and a simple yet effective +class-guided detector. This establishes a new paradigm for COD, bridging the +gap between contextual understanding and class-guided detection. Extensive +experimental results demonstrate the effectiveness of our flexible framework in +improving the performance of proposed and existing detectors by leveraging +class-level textual information. + +
+
+
+
+
+ + ♻ ☆ Evaluation of radiomic feature harmonization techniques for benign and + malignant pulmonary nodules + + +
+ BACKGROUND: Radiomics provides quantitative features of pulmonary nodules +(PNs) which could aid lung cancer diagnosis, but medical image acquisition +variability is an obstacle to clinical application. Acquisition effects may +differ between radiomic features from benign vs. malignant PNs. PURPOSE: We +evaluated how to account for differences between benign and malignant PNs when +correcting radiomic features' acquisition dependency. METHODS: We used 567 +chest CT scans grouped as benign, malignant, or lung cancer screening (mixed +benign, malignant). ComBat harmonization was applied to extracted features for +variation in 4 acquisition parameters. We compared: harmonizing without +distinction, harmonizing with a covariate to preserve distinctions between +subgroups, and harmonizing subgroups separately. Significant ($p\le0.05$) +Kruskal-Wallis tests showed whether harmonization removed acquisition +dependency. A LASSO-SVM pipeline was trained on successfully harmonized +features to predict malignancy. To evaluate predictive information in these +features, the trained harmonization estimators and predictive model were +applied to unseen test sets. Harmonization and predictive performance were +assessed for 10 trials of 5-fold cross-validation. RESULTS: An average 2.1% of +features (95% CI:1.9-2.4%) were acquisition-independent when harmonized without +distinction, 27.3% (95% CI:25.7-28.9%) when harmonized with a covariate, and +90.9% (95% CI:90.4-91.5%) when harmonized separately. Data harmonized +separately or with a covariate trained models with higher ROC-AUC for screening +scans than data harmonized without distinction between benign and malignant PNs +(Delong test, adjusted $p\le0.05$). CONCLUSIONS: Radiomic features of benign +and malignant PNs need different corrective transformations to recover +acquisition-independent distributions. This can be done by harmonizing +separately or with a covariate. + +
+
+ comment: 15 pages, 3 figures, plus supplemental material; updated author list, + corrected result in paragraph 3 of Discussion, updated Figure S1 +
+
+
+
+
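+ A heavily simplified, hypothetical sketch of the "harmonizing subgroups separately" strategy compared above: a per-scanner location/scale correction is fit within each diagnosis subgroup. Real ComBat additionally pools batch estimates with empirical Bayes; the column names and data below are made up for illustration.
+ <pre>
+ import numpy as np
+ import pandas as pd
+
+ def harmonize_location_scale(df, feature_cols, batch_col):
+     # Simplified stand-in for ComBat: remove per-batch location/scale effects.
+     out = df.copy()
+     for col in feature_cols:
+         grand_mu, grand_sd = df[col].mean(), df[col].std()
+         for _, g in df.groupby(batch_col):
+             z = (g[col] - g[col].mean()) / (g[col].std() + 1e-8)
+             out.loc[g.index, col] = z * grand_sd + grand_mu
+     return out
+
+ def harmonize_separately(df, feature_cols, batch_col, group_col):
+     # "Harmonize separately": benign and malignant nodules each get their own
+     # corrective transformation, as recommended in the abstract above.
+     return pd.concat([harmonize_location_scale(g, feature_cols, batch_col)
+                       for _, g in df.groupby(group_col)]).sort_index()
+
+ rng = np.random.default_rng(0)
+ df = pd.DataFrame({"f1": rng.normal(size=60),
+                    "scanner": rng.choice(["A", "B"], size=60),
+                    "label": rng.choice(["benign", "malignant"], size=60)})
+ print(harmonize_separately(df, ["f1"], "scanner", "label").head())
+ </pre>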
+ + ♻ ☆ Structural damage detection via hierarchical damage information with + volumetric assessment + + +
+ Structural health monitoring (SHM) is essential for ensuring the safety and +longevity of infrastructure, but complex image environments, noisy labels, and +reliance on manual damage assessments often hinder its effectiveness. This +study introduces the Guided Detection Network (Guided-DetNet), a framework +designed to address these challenges. Guided-DetNet is characterized by a +Generative Attention Module (GAM), Hierarchical Elimination Algorithm (HEA), +and Volumetric Contour Visual Assessment (VCVA). GAM leverages cross-horizontal +and cross-vertical patch merging and cross-foreground-background feature fusion +to generate varied features to mitigate complex image environments. HEA +addresses noisy labeling using hierarchical relationships among classes to +refine instances given an image by eliminating unlikely class instances. VCVA +assesses the severity of detected damages via volumetric representation and +quantification leveraging the Dirac delta distribution. A comprehensive +quantitative study and two robustness tests were conducted using the PEER Hub +dataset, and a drone-based application, which involved a field experiment, was +conducted to substantiate Guided-DetNet's promising performances. In triple +classification tasks, the framework achieved 96% accuracy, surpassing +state-of-the-art classifiers by up to 3%. In dual detection tasks, it +outperformed competitive detectors with a precision of 94% and a mean average +precision (mAP) of 79% while maintaining a frame rate of 57.04fps, suitable for +real-time applications. Additionally, robustness tests demonstrated resilience +under adverse conditions, with precision scores ranging from 79% to 91%. +Guided-DetNet is established as a robust and efficient framework for SHM, +offering advancements in automation and precision, with the potential for +widespread application in drone-based infrastructure inspections. + +
+
+
+
+
+ + ♻ ☆ SemTalk: Holistic Co-speech Motion Generation with Frame-level Semantic + Emphasis + + +
+ Good co-speech motion generation cannot be achieved without a careful +integration of common rhythmic motion and rare yet essential semantic motion. +In this work, we propose SemTalk for holistic co-speech motion generation with +frame-level semantic emphasis. Our key insight is to separately learn general +motions and sparse motions, and then adaptively fuse them. In particular, +rhythmic consistency learning is explored to establish rhythm-related base +motion, ensuring a coherent foundation that synchronizes gestures with the +speech rhythm. Subsequently, semantic emphasis learning is designed to +generate semantic-aware sparse motion, focusing on frame-level semantic cues. +Finally, to integrate sparse motion into the base motion and generate +semantic-emphasized co-speech gestures, we further leverage a learned semantic +score for adaptive synthesis. Qualitative and quantitative comparisons on two +public datasets demonstrate that our method outperforms the state-of-the-art, +delivering high-quality co-speech motion with enhanced semantic richness over a +stable base motion. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ ACE++: Instruction-Based Image Creation and Editing via Context-Aware + Content Filling + + +
+ We report ACE++, an instruction-based diffusion framework that tackles +various image generation and editing tasks. Inspired by the input format for +the inpainting task proposed by FLUX.1-Fill-dev, we improve the Long-context +Condition Unit (LCU) introduced in ACE and extend this input paradigm to any +editing and generation tasks. To take full advantage of image generative +priors, we develop a two-stage training scheme to minimize the efforts of +finetuning powerful text-to-image diffusion models like FLUX.1-dev. In the +first stage, we pre-train the model using task data with the 0-ref tasks from +the text-to-image model. There are many models in the community based on the +post-training of text-to-image foundational models that meet this training +paradigm of the first stage. For example, FLUX.1-Fill-dev deals primarily with +painting tasks and can be used as an initialization to accelerate the training +process. In the second stage, we finetune the above model to support the +general instructions using all tasks defined in ACE. To promote the widespread +application of ACE++ in different scenarios, we provide a comprehensive set of +models that cover both full finetuning and lightweight finetuning, while +considering general applicability and applicability in vertical scenarios. The +qualitative analysis showcases the superiority of ACE++ in terms of generating +image quality and prompt following ability. Code and models will be available +on the project page: https://ali-vilab.github.io/ACE_plus_page/. + +
+
+
+
+
+ + ♻ ☆ Solving Energy-Independent Density for CT Metal Artifact Reduction via + Neural Representation + + +
+ X-ray CT often suffers from shadowing and streaking artifacts in the presence +of metallic materials, which severely degrade imaging quality. Physically, the +linear attenuation coefficients (LACs) of metals vary significantly with X-ray +energy, causing a nonlinear beam hardening effect (BHE) in CT measurements. +Reconstructing CT images from metal-corrupted measurements consequently becomes +a challenging nonlinear inverse problem. Existing state-of-the-art (SOTA) metal +artifact reduction (MAR) algorithms rely on supervised learning with numerous +paired CT samples. While promising, these supervised methods often assume that +the unknown LACs are energy-independent, ignoring the energy-induced BHE, which +results in limited generalization. Moreover, the requirement for large datasets +also limits their applications in real-world scenarios. In this work, we +propose Density neural representation (Diner), a novel unsupervised MAR method. +Our key innovation lies in formulating MAR as an energy-independent density +reconstruction problem that strictly adheres to the photon-tissue absorption +physical model. This model is inherently nonlinear and complex, making it a +rarely considered approach in inverse imaging problems. By introducing the +water-equivalent tissues approximation and a new polychromatic model to +characterize the nonlinear CT acquisition process, we directly learn the neural +representation of the density map from raw measurements without using external +training data. This energy-independent density reconstruction framework +fundamentally resolves the nonlinear BHE, enabling superior MAR performance +across a wide range of scanning scenarios. Extensive experiments on both +simulated and real-world datasets demonstrate the superiority of our +unsupervised Diner over popular supervised methods in terms of MAR performance +and robustness. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ 3VL: Using Trees to Improve Vision-Language Models' Interpretability + + +
+ Vision-Language models (VLMs) have proven to be effective at aligning image +and text representations, producing superior zero-shot results when transferred +to many downstream tasks. However, these representations suffer from some key +shortcomings in understanding Compositional Language Concepts (CLC), such as +recognizing objects' attributes, states, and relations between different +objects. Moreover, VLMs typically have poor interpretability, making it +challenging to debug and mitigate compositional-understanding failures. In this +work, we introduce the architecture and training technique of Tree-augmented +Vision-Language (3VL) model accompanied by our proposed Anchor inference method +and Differential Relevance (DiRe) interpretability tool. By expanding the text +of an arbitrary image-text pair into a hierarchical tree structure using +language analysis tools, 3VL allows the induction of this structure into the +visual representation learned by the model, enhancing its interpretability and +compositional reasoning. Additionally, we show how Anchor, a simple technique +for text unification, can be used to filter nuisance factors while increasing +CLC understanding performance, e.g., on the fundamental VL-Checklist benchmark. +We also show how DiRe, which performs a differential comparison between VLM +relevancy maps, enables us to generate compelling visualizations of the reasons +for a model's success or failure. Our code is available at: +https://github.com/niryellinek/3VL. + +
+
+ comment: accepted to IEEE TIP +
+
+
+
+
+ + ♻ ☆ When No-Reference Image Quality Models Meet MAP Estimation in Diffusion + Latents + + +
+ Contemporary no-reference image quality assessment (NR-IQA) models can +effectively quantify perceived image quality, often achieving strong +correlations with human perceptual scores on standard IQA benchmarks. Yet, +limited efforts have been devoted to treating NR-IQA models as natural image +priors for real-world image enhancement, and consequently comparing them from a +perceptual optimization standpoint. In this work, we show -- for the first time +-- that NR-IQA models can be plugged into the maximum a posteriori (MAP) +estimation framework for image enhancement. This is achieved by performing +gradient ascent in the diffusion latent space rather than in the raw pixel +domain, leveraging a pretrained differentiable and bijective diffusion process. +Notably, different NR-IQA models lead to different enhanced outputs, which in +turn provides a new computational means of comparing them. Unlike conventional +correlation-based measures, our comparison method offers complementary insights +into the respective strengths and weaknesses of the competing NR-IQA models in +perceptual optimization scenarios. Additionally, we aim to improve the +best-performing NR-IQA model in diffusion latent MAP estimation by +incorporating the advantages of other top-performing methods. The resulting +model delivers noticeably better results in enhancing real-world images +afflicted by unknown and complex distortions, all while preserving a high degree of +image fidelity. + +
+
+
+
+
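+ A bare-bones, hypothetical sketch of MAP estimation by gradient ascent in a latent space, as described above; the differentiable decoder and NR-IQA scorer below are toy placeholders for the pretrained bijective diffusion process and quality model used in the paper.
+ <pre>
+ import torch
+
+ # Stand-ins: a differentiable "decoder" from latents to pixels and a
+ # differentiable NR-IQA scorer.
+ decode = lambda z: torch.tanh(z)
+ iqa_score = lambda img: -(img - 0.2).pow(2).mean()
+
+ def map_enhance(z_init, steps=100, lr=0.05, prior_weight=0.01):
+     # MAP estimation by gradient ascent in latent space: maximize
+     # IQA(decode(z)) while a Gaussian prior on z preserves fidelity.
+     z = z_init.clone().requires_grad_(True)
+     opt = torch.optim.Adam([z], lr=lr)
+     for _ in range(steps):
+         obj = iqa_score(decode(z)) - prior_weight * z.pow(2).mean()
+         opt.zero_grad()
+         (-obj).backward()
+         opt.step()
+     return decode(z).detach()
+
+ enhanced = map_enhance(torch.randn(1, 3, 8, 8))
+ print(enhanced.shape)
+ </pre>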
+ + ♻ ☆ Sports-QA: A Large-Scale Video Question Answering Benchmark for Complex + and Professional Sports + + +
+ Reasoning over sports videos for question answering is an important task with +numerous applications, such as player training and information retrieval. +However, this task has not been explored due to the lack of relevant datasets +and the challenging nature it presents. Most datasets for video question +answering (VideoQA) focus mainly on general and coarse-grained understanding of +daily-life videos, which is not applicable to sports scenarios requiring +professional action understanding and fine-grained motion analysis. In this +paper, we introduce the first dataset, named Sports-QA, specifically designed +for the sports VideoQA task. The Sports-QA dataset includes various types of +questions, such as descriptions, chronologies, causalities, and counterfactual +conditions, covering multiple sports. Furthermore, to address the +characteristics of the sports VideoQA task, we propose a new Auto-Focus +Transformer (AFT) capable of automatically focusing on particular scales of +temporal information for question answering. We conduct extensive experiments +on Sports-QA, including baseline studies and the evaluation of different +methods. The results demonstrate that our AFT achieves state-of-the-art +performance. + +
+
+
+
+
+ + ♻ ☆ Maximizing Uncertainty for Federated learning via Bayesian + Optimisation-based Model Poisoning + + +
+ As we transition from Narrow Artificial Intelligence towards Artificial Super +Intelligence, users are increasingly concerned about their privacy and the +trustworthiness of machine learning (ML) technology. A common denominator for +the metrics of trustworthiness is the quantification of uncertainty inherent in +DL algorithms, and specifically in the model parameters, input data, and model +predictions. One of the common approaches to address privacy-related issues in +DL is to adopt distributed learning such as federated learning (FL), where +private raw data is not shared among users. Despite the privacy-preserving +mechanisms in FL, it still faces challenges in trustworthiness. Specifically, +malicious users, during training, can systematically create malicious model +parameters to compromise the model's predictive and generative capabilities, +resulting in high uncertainty about their reliability. To demonstrate malicious +behaviour, we propose a novel model poisoning attack method named Delphi, which +aims to maximise the uncertainty of the global model output. We achieve this by +taking advantage of the relationship between the uncertainty and the model +parameters of the first hidden layer of the local model. Delphi employs two +types of optimisation, Bayesian Optimisation and Least Squares Trust Region, +to search for the optimal poisoned model parameters, named Delphi-BO and +Delphi-LSTR. We quantify the uncertainty using the KL Divergence to minimise +the distance of the predictive probability distribution towards an uncertain +distribution of model output. Furthermore, we establish a mathematical proof +for the attack effectiveness demonstrated in FL. Numerical results demonstrate +that Delphi-BO induces a higher amount of uncertainty than Delphi-LSTR, +highlighting the vulnerability of FL systems to model poisoning attacks. + +
+
+ comment: 14 pages +
+
+
+
+
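+ An illustrative, non-authoritative sketch of the poisoning objective above: the attacker perturbs the first hidden layer so that the predictive distribution moves toward the uniform (maximally uncertain) distribution, measured with a KL divergence. Random search stands in for the Bayesian-optimisation / trust-region solvers (Delphi-BO / Delphi-LSTR) used in the paper.
+ <pre>
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # Toy victim model; the attacker tampers with the first hidden layer only.
+ model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 5))
+ x_probe = torch.randn(64, 10)  # data the attacker uses to probe predictions
+
+ def uncertainty_objective(first_layer_weight):
+     # KL(p || uniform): small values mean the predictive distribution is
+     # close to uniform, i.e. maximally uncertain.
+     with torch.no_grad():
+         model[0].weight.copy_(first_layer_weight)
+         p = F.softmax(model(x_probe), dim=-1)
+     uniform = torch.full_like(p, 1.0 / p.shape[-1])
+     return F.kl_div(uniform.log(), p, reduction="batchmean")
+
+ # Random search over poisoned first-layer weights; keep the candidate that
+ # best flattens the predictions.
+ best_w = model[0].weight.detach().clone()
+ best_kl = uncertainty_objective(best_w)
+ for _ in range(200):
+     cand = best_w + 0.1 * torch.randn_like(best_w)
+     kl = uncertainty_objective(cand)
+     if kl < best_kl:
+         best_w, best_kl = cand, kl
+ with torch.no_grad():
+     model[0].weight.copy_(best_w)
+ print(float(best_kl))
+ </pre>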
+ + ♻ ☆ MGF: Mixed Gaussian Flow for Diverse Trajectory Prediction + + +
+ For future trajectory prediction, a normalizing flow with a standard Gaussian +prior suffers from weak diversity. This ineffectiveness stems from the conflict +between the asymmetric, multi-modal distribution of likely outcomes +and the symmetric, single-modal prior and its supervision losses. +Instead, we propose constructing a mixed Gaussian prior for a normalizing flow +model for trajectory prediction. The prior is constructed by analyzing the +trajectory patterns in the training samples without requiring extra annotations +while showing better expressiveness and being multi-modal and asymmetric. +Besides diversity, it also provides better controllability for probabilistic +trajectory generation. We name our method Mixed Gaussian Flow (MGF). It +achieves state-of-the-art performance in the evaluation of both trajectory +alignment and diversity on the popular UCY/ETH and SDD datasets. Code is +available at https://github.com/mulplue/MGF. + +
+
+ comment: Accepted by Neurips 2024. Code: https://github.com/mulplue/MGF +
+
+
+
+
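+ A toy sketch of constructing a mixed Gaussian prior from training trajectory patterns, in the spirit of the abstract above (a tiny k-means plus per-cluster covariances); the actual MGF flow model and datasets are not reproduced here, and the clustering choice is an assumption for illustration.
+ <pre>
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+
+ # Toy "training trajectories": final displacement vectors of observed agents.
+ train_disp = np.concatenate([
+     rng.normal([2.0, 0.0], 0.3, size=(200, 2)),   # most agents head right
+     rng.normal([0.0, 2.0], 0.3, size=(80, 2)),    # some agents head up
+ ])
+
+ def fit_mixed_gaussian_prior(samples, k=2, iters=20):
+     # Tiny k-means plus per-cluster covariances: builds an asymmetric,
+     # multi-modal prior from trajectory patterns, with no extra annotations.
+     centers = samples[rng.choice(len(samples), k, replace=False)]
+     for _ in range(iters):
+         assign = np.argmin(((samples[:, None] - centers[None]) ** 2).sum(-1), axis=1)
+         centers = np.stack([samples[assign == j].mean(0) if np.any(assign == j)
+                             else centers[j] for j in range(k)])
+     weights = np.bincount(assign, minlength=k) / len(samples)
+     covs = [np.cov(samples[assign == j].T) + 1e-4 * np.eye(2) for j in range(k)]
+     return weights, centers, covs
+
+ def sample_prior(weights, centers, covs, n=5):
+     comp = rng.choice(len(weights), size=n, p=weights)
+     return np.stack([rng.multivariate_normal(centers[c], covs[c]) for c in comp])
+
+ w, mu, cov = fit_mixed_gaussian_prior(train_disp)
+ z = sample_prior(w, mu, cov)   # latents that a flow decoder would transform
+ print(z)
+ </pre>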
+ + ♻ ☆ Mask-guided cross-image attention for zero-shot in-silico + histopathologic image generation with a diffusion model + + +
+ Creating in-silico data with generative AI promises a cost-effective +alternative to staining, imaging, and annotating whole slide images in +computational pathology. Diffusion models are the state-of-the-art solution for +generating in-silico images, offering unparalleled fidelity and realism. Using +appearance transfer diffusion models allows for zero-shot image generation, +facilitating fast application and making model training unnecessary. However, +current appearance transfer diffusion models are designed for natural images, +where the main task is to transfer the foreground object from an origin to a +target domain, while the background is of insignificant importance. In +computational pathology, specifically in oncology, it is however not +straightforward to define which objects in an image should be classified as +foreground and background, as all objects in an image may be of critical +importance for a detailed understanding of the tumor micro-environment. We +contribute to the applicability of appearance transfer diffusion models to +immunohistochemistry-stained images by modifying the appearance transfer +guidance to alternate between class-specific AdaIN feature statistics matchings +using existing segmentation masks. The performance of the proposed method is +demonstrated on the downstream task of supervised epithelium segmentation, +showing that the number of manual annotations required for model training can +be reduced by 75%, outperforming the baseline approach. Additionally, we +consulted with a certified pathologist to investigate future improvements. We +anticipate this work to inspire the application of zero-shot diffusion models +in computational pathology, providing an efficient method to generate in-silico +images with unmatched fidelity and realism, which prove meaningful for +downstream tasks, such as training existing deep learning models or finetuning +foundation models. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ RoHan: Robust Hand Detection in Operation Room + + +
+ Hand-specific localization has garnered significant interest within the +computer vision community. Although there are numerous datasets with hand +annotations from various angles and settings, domain transfer techniques +frequently struggle in surgical environments. This is mainly due to the limited +availability of gloved hand instances and the unique challenges of operating +rooms (ORs). Thus, hand-detection models tailored to OR settings require +extensive training and expensive annotation processes. To overcome these +challenges, we present "RoHan" - a novel approach for robust hand detection in +the OR, leveraging advanced semi-supervised domain adaptation techniques to +tackle the challenges of varying recording conditions, diverse glove colors, +and occlusions common in surgical settings. Our methodology encompasses two +main stages: (1) data augmentation strategy that utilizes "Artificial Gloves," +a method for augmenting publicly available hand datasets with synthetic images +of hands-wearing gloves; (2) semi-supervised domain adaptation pipeline that +improves detection performance in real-world OR settings through iterative +prediction refinement and efficient frame filtering. We evaluate our method +using two datasets: simulated enterotomy repair and saphenous vein graft +harvesting. "RoHan" substantially reduces the need for extensive labeling and +model training, paving the way for the practical implementation of hand +detection technologies in medical settings. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Diffusion-based Unsupervised Audio-visual Speech Enhancement + + +
+ This paper proposes a new unsupervised audio-visual speech enhancement (AVSE) +approach that combines a diffusion-based audio-visual speech generative model +with a non-negative matrix factorization (NMF) noise model. First, the +diffusion model is pre-trained on clean speech conditioned on corresponding +video data to simulate the speech generative distribution. This pre-trained +model is then paired with the NMF-based noise model to estimate clean speech +iteratively. Specifically, a diffusion-based posterior sampling approach is +implemented within the reverse diffusion process, where after each iteration, a +speech estimate is obtained and used to update the noise parameters. +Experimental results confirm that the proposed AVSE approach not only +outperforms its audio-only counterpart but also generalizes better than a +recent supervised-generative AVSE method. Additionally, the new inference +algorithm offers a better balance between inference speed and performance +compared to the previous diffusion-based method. Code and demo available at: +https://jeaneudesayilo.github.io/fast_UdiffSE + +
+
+
+
+
+ + ♻ ☆ Improving Pain Classification using Spatio-Temporal Deep Learning + Approaches with Facial Expressions + + +
+ Pain management and severity detection are crucial for effective treatment, +yet traditional self-reporting methods are subjective and may be unsuitable for +non-verbal individuals (people with limited speaking skills). To address this +limitation, we explore automated pain detection using facial expressions. Our +study leverages deep learning techniques to improve pain assessment by +analyzing facial images from the Pain Emotion Faces Database (PEMF). We propose +two novel approaches: (1) a hybrid ConvNeXt model combined with Long +Short-Term Memory (LSTM) blocks to analyze video frames and predict pain +presence, and (2) a Spatio-Temporal Graph Convolution Network (STGCN) +integrated with LSTM to process landmarks from facial images for pain +detection. Our work represents the first use of the PEMF dataset for binary +pain classification and demonstrates the effectiveness of these models through +extensive experimentation. The results highlight the potential of combining +spatial and temporal features for enhanced pain detection, offering a promising +advancement in objective pain assessment methodologies. + +
+
+ comment: 8 pages, 3 figures, 3 tables. Accepted and presented at the 18th + International Conference on Machine Vision (ICMV 2024), Edinburgh, UK +
+
+
+
+
+ + ♻ ☆ Multispectral Pedestrian Detection with Sparsely Annotated Label + + +
+ Although existing Sparsely Annotated Object Detection (SAOD) approaches have +made progress in handling sparsely annotated environments in the multispectral +domain, where only some pedestrians are annotated, they still have the +following limitations: (i) they lack considerations for improving the quality +of pseudo-labels for missing annotations, and (ii) they rely on fixed ground +truth annotations, which leads to learning only a limited range of pedestrian +visual appearances in the multispectral domain. To address these issues, we +propose a novel framework called Sparsely Annotated Multispectral Pedestrian +Detection (SAMPD). For limitation (i), we introduce Multispectral +Pedestrian-aware Adaptive Weight (MPAW) and Positive Pseudo-label Enhancement +(PPE) module. Utilizing multispectral knowledge, these modules ensure the +generation of high-quality pseudo-labels and enable effective learning by +increasing weights for high-quality pseudo-labels based on modality +characteristics. To address limitation (ii), we propose an Adaptive Pedestrian +Retrieval Augmentation (APRA) module, which adaptively incorporates pedestrian +patches from ground-truth and dynamically integrates high-quality pseudo-labels +with the ground-truth, facilitating a more diverse learning pool of +pedestrians. Extensive experimental results demonstrate that our SAMPD +significantly enhances performance in sparsely annotated environments within +the multispectral domain. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Approximation properties relative to continuous scale space for hybrid + discretizations of Gaussian derivative operators + + +
+ This paper presents an analysis of properties of two hybrid discretization +methods for Gaussian derivatives, based on convolutions with either the +normalized sampled Gaussian kernel or the integrated Gaussian kernel followed +by central differences. The motivation for studying these discretization +methods is that in situations when multiple spatial derivatives of different +order are needed at the same scale level, they can be computed significantly +more efficiently compared to more direct derivative approximations based on +explicit convolutions with either sampled Gaussian kernels or integrated +Gaussian kernels. + While these computational benefits do also hold for the genuinely discrete +approach for computing discrete analogues of Gaussian derivatives, based on +convolution with the discrete analogue of the Gaussian kernel followed by +central differences, the underlying mathematical primitives for the discrete +analogue of the Gaussian kernel, in terms of modified Bessel functions of +integer order, may not be available in certain frameworks for image processing, +such as when performing deep learning based on scale-parameterized filters in +terms of Gaussian derivatives, with learning of the scale levels. + In this paper, we present a characterization of the properties of these +hybrid discretization methods, in terms of quantitative performance measures +concerning the amount of spatial smoothing that they imply, as well as the +relative consistency of scale estimates obtained from scale-invariant feature +detectors with automatic scale selection, with an emphasis on the behaviour for +very small values of the scale parameter, which may differ significantly from +corresponding results obtained from the fully continuous scale-space theory, as +well as between different types of discretization methods. + +
+
+ comment: 23 pages, 9 figures. arXiv admin note: text overlap with + arXiv:2311.11317 +
+
+
+
+
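+ A small illustrative sketch (1-D, using SciPy) of the hybrid scheme analysed above: smooth with the normalized sampled Gaussian kernel, then apply central differences, so that several derivative orders can share one smoothing pass. The kernel radius and test signal are arbitrary choices for the example; the integrated-Gaussian variant would only change how the kernel is built.
+ <pre>
+ import numpy as np
+ from scipy.ndimage import convolve1d
+
+ def sampled_gaussian(sigma, radius=None):
+     # Normalized sampled Gaussian kernel (one of the two hybrid variants).
+     radius = radius or int(np.ceil(4 * sigma))
+     x = np.arange(-radius, radius + 1)
+     g = np.exp(-x ** 2 / (2 * sigma ** 2))
+     return g / g.sum()
+
+ def hybrid_gaussian_derivative(signal, sigma, order=1):
+     # Smooth once, then apply central differences; multiple derivative
+     # orders can reuse the single smoothing pass.
+     smoothed = convolve1d(signal, sampled_gaussian(sigma), mode="nearest")
+     diff = {1: np.array([0.5, 0.0, -0.5]),          # first-order central difference
+             2: np.array([1.0, -2.0, 1.0])}[order]   # second-order central difference
+     return convolve1d(smoothed, diff, mode="nearest")
+
+ sig = np.sin(np.linspace(0, 4 * np.pi, 200))
+ print(hybrid_gaussian_derivative(sig, sigma=2.0, order=1)[:5])
+ </pre>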
+ + ♻ ☆ OminiControl: Minimal and Universal Control for Diffusion Transformer + + +
+ In this paper, we introduce OminiControl, a highly versatile and +parameter-efficient framework that integrates image conditions into pre-trained +Diffusion Transformer (DiT) models. At its core, OminiControl leverages a +parameter reuse mechanism, enabling the DiT to encode image conditions using +itself as a powerful backbone and process them with its flexible multi-modal +attention processors. Unlike existing methods, which rely heavily on additional +encoder modules with complex architectures, OminiControl (1) effectively and +efficiently incorporates injected image conditions with only ~0.1% additional +parameters, and (2) addresses a wide range of image conditioning tasks in a +unified manner, including subject-driven generation and spatially-aligned +conditions such as edges, depth, and more. Remarkably, these capabilities are +achieved by training on images generated by the DiT itself, which is +particularly beneficial for subject-driven generation. Extensive evaluations +demonstrate that OminiControl outperforms existing UNet-based and DiT-adapted +models in both subject-driven and spatially-aligned conditional generation. +Additionally, we release our training dataset, Subjects200K, a diverse +collection of over 200,000 identity-consistent images, along with an efficient +data synthesis pipeline to advance research in subject-consistent generation. + +
+
+
+
+
+ + ♻ ☆ CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network + + +
+ In recent years, Wi-Fi sensing has garnered significant attention due to its +numerous benefits, such as privacy protection, low cost, and penetration +ability. Extensive research has been conducted in this field, focusing on areas +such as gesture recognition, people identification, and fall detection. +However, many data-driven methods encounter challenges related to domain shift, +where the model fails to perform well in environments different from the +training data. One major factor contributing to this issue is the limited +availability of Wi-Fi sensing datasets, which makes models learn excessive +irrelevant information and over-fit to the training set. Unfortunately, +collecting large-scale Wi-Fi sensing datasets across diverse scenarios is a +challenging task. To address this problem, we propose CrossFi, a siamese +network-based approach that excels in both in-domain scenario and cross-domain +scenario, including few-shot, zero-shot scenarios, and even works in few-shot +new-class scenario where testing set contains new categories. The core +component of CrossFi is a sample-similarity calculation network called CSi-Net, +which improves the structure of the siamese network by using an attention +mechanism to capture similarity information, instead of simply calculating the +distance or cosine similarity. Based on it, we develop an extra Weight-Net that +can generate a template for each class, so that our CrossFi can work in +different scenarios. Experimental results demonstrate that our CrossFi achieves +state-of-the-art performance across various scenarios. In gesture recognition +task, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72% +in one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario, +and 84.75% in one-shot new-class scenario. The code for our model is publicly +available at https://github.com/RS2002/CrossFi. + +
+
+
+
+
+ + ♻ ☆ Multiple Information Prompt Learning for Cloth-Changing Person + Re-Identification + + +
+ Cloth-changing person re-identification is a subject closer to the real +world, which focuses on solving the problem of person re-identification after +pedestrians change clothes. The primary challenge in this field is to overcome +the complex interplay between intra-class and inter-class variations and to +identify features that remain unaffected by changes in appearance. Sufficient +data collection for model training would significantly aid in addressing this +problem. However, it is challenging to gather diverse datasets in practice. +Current methods focus on implicitly learning identity information from the +original image or introducing additional auxiliary models, which are largely +limited by the quality of the image and the performance of the additional +model. To address these issues, inspired by prompt learning, we propose a novel +multiple information prompt learning (MIPL) scheme for cloth-changing person +ReID, which learns identity robust features through the common prompt guidance +of multiple messages. Specifically, the clothing information stripping (CIS) +module is designed to decouple the clothing information from the original RGB +image features to counteract the influence of clothing appearance. The +Bio-guided attention (BGA) module is proposed to increase the learning +intensity of the model for key information. A dual-length hybrid patch (DHP) +module is employed to make the features have diverse coverage to minimize the +impact of feature bias. Extensive experiments demonstrate that the proposed +method outperforms all state-of-the-art methods on the LTCC, Celeb-reID, +Celeb-reID-light, and CSCC datasets, achieving rank-1 scores of 74.8%, 73.3%, +66.0%, and 88.1%, respectively. When compared to AIM (CVPR23), ACID (TIP23), +and SCNet (MM23), MIPL achieves rank-1 improvements of 11.3%, 13.8%, and 7.9%, +respectively, on the PRCC dataset. + +
+
+
+
+
+ + ♻ ☆ The Silent Majority: Demystifying Memorization Effect in the Presence of + Spurious Correlations + + +
+ Machine learning models often rely on simple spurious features -- patterns in +training data that correlate with targets but are not causally related to them, +like image backgrounds in foreground classification. This reliance typically +leads to imbalanced test performance across minority and majority groups. In +this work, we take a closer look at the fundamental cause of such imbalanced +performance through the lens of memorization, which refers to the ability to +predict accurately on \textit{atypical} examples (minority groups) in the +training set but failing in achieving the same accuracy in the testing set. +This paper systematically shows the ubiquitous existence of spurious features +in a small set of neurons within the network, providing the first-ever evidence +that memorization may contribute to imbalanced group performance. Through three +experimental sources of converging empirical evidence, we find the property of +a small subset of neurons or channels in memorizing minority group information. +Inspired by these findings, we articulate the hypothesis: the imbalanced group +performance is a byproduct of ``noisy'' spurious memorization confined to a +small set of neurons. To further substantiate this hypothesis, we show that +eliminating these unnecessary spurious memorization patterns via a novel +framework during training can significantly affect the model performance on +minority groups. Our experimental results across various architectures and +benchmarks offer new insights on how neural networks encode core and spurious +knowledge, laying the groundwork for future research in demystifying robustness +to spurious correlation. + +
+
+
+
+
+ + ♻ ☆ DATransNet: Dynamic Attention Transformer Network for Infrared Small + Target Detection + + +
+ Infrared small target detection (ISTD) is widely used in civilian and +military applications. However, ISTD encounters several challenges, including +the tendency for small and dim targets to be obscured by complex backgrounds. To +address this issue, we propose the Dynamic Attention Transformer Network +(DATransNet), which aims to extract and preserve edge information of small +targets. DATransNet employs the Dynamic Attention Transformer (DATrans), +simulating central difference convolutions (CDC) to extract and integrate +gradient features with deeper features. Furthermore, we propose a global feature +extraction module (GFEM) that offers a comprehensive perspective to prevent the +network from focusing solely on details while neglecting the background +information. We compare the network with state-of-the-art (SOTA) approaches, +and the results demonstrate that our method performs effectively. Our source +code is available at https://github.com/greekinRoma/DATransNet. + +
+
+
+
+
+ + ♻ ☆ Ultra-High-Definition Image Deblurring via Multi-scale Cubic-Mixer + + +
+ Currently, transformer-based algorithms are making a splash in the domain of +image deblurring. Their achievement depends on the self-attention mechanism +with a CNN stem to model long-range dependencies between tokens. Unfortunately, +this appealing pipeline introduces high computational complexity and makes +it difficult to run an ultra-high-definition image on a single GPU in real +time. To trade off accuracy and efficiency, the degraded input image is +processed cyclically over three-dimensional ($C$, $W$, and $H$) signals without +a self-attention mechanism. We term this deep network Multi-scale +Cubic-Mixer; it acts on both the real and imaginary components after the +fast Fourier transform to estimate the Fourier coefficients and thus obtain a +deblurred image. Furthermore, we combine the multi-scale cubic-mixer with a +slicing strategy to generate high-quality results at a much lower computational +cost. Experimental results demonstrate that the proposed algorithm performs +favorably against the state-of-the-art deblurring approaches on several +benchmarks and a new ultra-high-definition dataset in terms of accuracy and +speed. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Zero-shot Video Restoration and Enhancement Using Pre-Trained Image + Diffusion Model + + +
+ Diffusion-based zero-shot image restoration and enhancement models have +achieved great success in various tasks of image restoration and enhancement. +However, directly applying them to video restoration and enhancement results in +severe temporal flickering artifacts. In this paper, we propose the first +framework for zero-shot video restoration and enhancement based on the +pre-trained image diffusion model. By replacing the spatial self-attention +layer with the proposed short-long-range (SLR) temporal attention layer, the +pre-trained image diffusion model can take advantage of the temporal +correlation between frames. We further propose temporal consistency guidance, +spatial-temporal noise sharing, and an early stopping sampling strategy to +improve temporally consistent sampling. Our method is a plug-and-play module +that can be inserted into any diffusion-based image restoration or enhancement +methods to further improve their performance. Experimental results demonstrate +the superiority of our proposed method. Our code is available at +https://github.com/cao-cong/ZVRD. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Continuous Concepts Removal in Text-to-image Diffusion Models + + +
+ Text-to-image diffusion models have shown an impressive ability to generate +high-quality images from input textual descriptions. However, concerns have +been raised about the potential for these models to create content that +infringes on copyrights or depicts disturbing subject matter. Removing specific +concepts from these models is a promising potential solution to this problem. +However, existing methods for concept removal do not work well in practical but +challenging scenarios where concepts need to be continuously removed. +Specifically, these methods lead to poor alignment between the text prompts and +the generated image after the continuous removal process. To address this +issue, we propose a novel approach called CCRT that includes a designed +knowledge distillation paradigm. It constrains the text-image alignment +behavior during the continuous concept removal process by using a set of text +prompts generated through our genetic algorithm, which employs a designed +fuzzing strategy. We conduct extensive experiments involving the removal of +various concepts. The results evaluated through both algorithmic metrics and +human studies demonstrate that our CCRT can effectively remove the targeted +concepts in a continuous manner while maintaining the high generation quality +(e.g., text-image alignment) of the model. + +
+
+
+
+
+ + ♻ ☆ Conformal-in-the-Loop for Learning with Imbalanced Noisy Data + + +
+ Class imbalance and label noise are pervasive in large-scale datasets, yet +much of machine learning research assumes well-labeled, balanced data, which +rarely reflects real world conditions. Existing approaches typically address +either label noise or class imbalance in isolation, leading to suboptimal +results when both issues coexist. In this work, we propose +Conformal-in-the-Loop (CitL), a novel training framework that addresses both +challenges with a conformal prediction-based approach. CitL evaluates sample +uncertainty to adjust weights and prune unreliable examples, enhancing model +resilience and accuracy with minimal computational cost. Our extensive +experiments include a detailed analysis showing how CitL effectively emphasizes +impactful data in noisy, imbalanced datasets. Our results show that CitL +consistently boosts model performance, achieving up to a 6.1% increase in +classification accuracy and a 5.0 mIoU improvement in segmentation. Our code is +publicly available: CitL. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Investigating the Effect of Network Pruning on Performance and + Interpretability + + +
+ Deep Neural Networks (DNNs) are often over-parameterized for their tasks and +can be compressed quite drastically by removing weights, a process called +pruning. We investigate the impact of different pruning techniques on the +classification performance and interpretability of GoogLeNet. We systematically +apply unstructured and structured pruning, as well as connection sparsity +(pruning of input weights) methods to the network and analyze the outcomes +regarding the network's performance on the validation set of ImageNet. We also +compare different retraining strategies, such as iterative pruning and one-shot +pruning. We find that with sufficient retraining epochs, the performance of the +networks can approximate the performance of the default GoogLeNet - and even +surpass it in some cases. To assess interpretability, we employ the Mechanistic +Interpretability Score (MIS) developed by Zimmermann et al. . Our experiments +reveal that there is no significant relationship between interpretability and +pruning rate when using MIS as a measure. Additionally, we observe that +networks with extremely low accuracy can still achieve high MIS scores, +suggesting that the MIS may not always align with intuitive notions of +interpretability, such as understanding the basis of correct decisions. + +
+
+ comment: 4 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Make-A-Character 2: Animatable 3D Character Generation From a Single + Image + + +
+ This report introduces Make-A-Character 2, an advanced system for generating +high-quality 3D characters from single portrait photographs, ideal for game +development and digital human applications. Make-A-Character 2 builds upon its +predecessor by incorporating several significant improvements for image-based +head generation. We utilize the IC-Light method to correct non-ideal +illumination in input photos and apply neural network-based color correction to +harmonize skin tones between the photos and game engine renders. We also employ +the Hierarchical Representation Network to capture high-frequency facial +structures and conduct adaptive skeleton calibration for accurate and +expressive facial animations. The entire image-to-3D-character generation +process takes less than 2 minutes. Furthermore, we leverage transformer +architecture to generate co-speech facial and gesture actions, enabling +real-time conversation with the generated character. These technologies have +been integrated into our conversational AI avatar products. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Multi-modal and Multi-scale Spatial Environment Understanding for + Immersive Visual Text-to-Speech + + +
+ Visual Text-to-Speech (VTTS) aims to take the environmental image as the +prompt to synthesize the reverberant speech for the spoken content. The +challenge of this task lies in understanding the spatial environment from the +image. Many attempts have been made to extract global spatial visual +information from the RGB space of an spatial image. However, local and depth +image information are crucial for understanding the spatial environment, which +previous works have ignored. To address the issues, we propose a novel +multi-modal and multi-scale spatial environment understanding scheme to achieve +immersive VTTS, termed M2SE-VTTS. The multi-modal aims to take both the RGB and +Depth spaces of the spatial image to learn more comprehensive spatial +information, and the multi-scale seeks to model the local and global spatial +knowledge simultaneously. Specifically, we first split the RGB and Depth images +into patches and adopt the Gemini-generated environment captions to guide the +local spatial understanding. After that, the multi-modal and multi-scale +features are integrated by the local-aware global spatial understanding. In +this way, M2SE-VTTS effectively models the interactions between local and +global spatial contexts in the multi-modal spatial environment. Objective and +subjective evaluations suggest that our model outperforms the advanced +baselines in environmental speech generation. The code and audio samples are +available at: https://github.com/AI-S2-Lab/M2SE-VTTS. + +
+
+ comment: 9 pages,2 figures, Accepted by AAAI'2025 +
+
+
+
+
+ + ♻ ☆ Multi-Context Temporal Consistent Modeling for Referring Video Object + Segmentation + + +
+ Referring video object segmentation aims to segment objects within a video +corresponding to a given text description. Existing transformer-based temporal +modeling approaches face challenges related to query inconsistency and the +limited consideration of context. Query inconsistency produces unstable masks +of different objects in the middle of the video. The limited consideration of +context leads to the segmentation of incorrect objects by failing to adequately +account for the relationship between the given text and instances. To address +these issues, we propose the Multi-context Temporal Consistency Module (MTCM), +which consists of an Aligner and a Multi-Context Enhancer (MCE). The Aligner +removes noise from queries and aligns them to achieve query consistency. The +MCE predicts text-relevant queries by considering multi-context. We applied +MTCM to four different models, increasing performance across all of them, +particularly achieving 47.6 J&F on the MeViS. Code is available at +https://github.com/Choi58/MTCM. + +
+
+ comment: Comment: Accepted to ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ Adaptive Noise-Tolerant Network for Image Segmentation + + +
+ Unlike image classification and annotation, for which deep network models +have achieved dominating superior performances compared to traditional computer +vision algorithms, deep learning for automatic image segmentation still faces +critical challenges. One of such hurdles is to obtain ground-truth +segmentations as the training labels for deep network training. Especially when +we study biomedical images, such as histopathological images (histo-images), it +is unrealistic to ask for manual segmentation labels as the ground truth for +training due to the fine image resolution as well as the large image size and +complexity. In this paper, instead of relying on clean segmentation labels, we +study whether and how integrating imperfect or noisy segmentation results from +off-the-shelf segmentation algorithms may help achieve better segmentation +results through a new Adaptive Noise-Tolerant Network (ANTN) model. We extend +the noisy label deep learning to image segmentation with two novel aspects: (1) +multiple noisy labels can be integrated into one deep learning model; (2) noisy +segmentation modeling, including probabilistic parameters, is adaptive, +depending on the given testing image appearance. Implementation of the new ANTN +model on both the synthetic data and real-world histo-images demonstrates its +effectiveness and superiority over off-the-shelf and other existing +deep-learning-based image segmentation algorithms. + +
+
+
+
+
+ + ♻ ☆ Efficient Long Video Tokenization via Coordinate-based Patch + Reconstruction + + +
+ Efficient tokenization of videos remains a challenge in training vision +models that can process long videos. One promising direction is to develop a +tokenizer that can encode long video clips, as it would enable the tokenizer to +leverage the temporal coherence of videos better for tokenization. However, +training existing tokenizers on long videos often incurs a huge training cost +as they are trained to reconstruct all the frames at once. In this paper, we +introduce CoordTok, a video tokenizer that learns a mapping from +coordinate-based representations to the corresponding patches of input videos, +inspired by recent advances in 3D generative models. In particular, CoordTok +encodes a video into factorized triplane representations and reconstructs +patches that correspond to randomly sampled $(x,y,t)$ coordinates. This allows +for training large tokenizer models directly on long videos without requiring +excessive training resources. Our experiments show that CoordTok can +drastically reduce the number of tokens for encoding long video clips. For +instance, CoordTok can encode a 128-frame video with 128$\times$128 resolution +into 1280 tokens, while baselines need 6144 or 8192 tokens to achieve similar +reconstruction quality. We further show that this efficient video tokenization +enables memory-efficient training of a diffusion transformer that can generate +128 frames at once. + +
+
+ comment: Code is available on the project webpage: + https://huiwon-jang.github.io/coordtok/ +
+
+
+
+
+ + ♻ ☆ A Unifying Information-theoretic Perspective on Evaluating Generative + Models + + +
+ Considering the difficulty of interpreting generative model output, there is +significant current research focused on determining meaningful evaluation +metrics. Several recent approaches utilize "precision" and "recall," borrowed +from the classification domain, to individually quantify the output fidelity +(realism) and output diversity (representation of the real data variation), +respectively. With the increase in metric proposals, there is a need for a +unifying perspective, allowing for easier comparison and clearer explanation of +their benefits and drawbacks. To this end, we unify a class of +kth-nearest-neighbors (kNN)-based metrics under an information-theoretic lens +using approaches from kNN density estimation. Additionally, we propose a +tri-dimensional metric composed of Precision Cross-Entropy (PCE), Recall +Cross-Entropy (RCE), and Recall Entropy (RE), which separately measure fidelity +and two distinct aspects of diversity, inter- and intra-class. Our +domain-agnostic metric, derived from the information-theoretic concepts of +entropy and cross-entropy, can be dissected for both sample- and mode-level +analysis. Our detailed experimental results demonstrate the sensitivity of our +metric components to their respective qualities and reveal undesirable +behaviors of other metrics. + +
+
+
+
+
+ + ♻ ☆ Enhancing Skin Disease Diagnosis: Interpretable Visual Concept Discovery + with SAM + + +
+ Current AI-assisted skin image diagnosis has achieved dermatologist-level +performance in classifying skin cancer, driven by rapid advancements in deep +learning architectures. However, unlike traditional vision tasks, skin images +in general present unique challenges due to the limited availability of +well-annotated datasets, complex variations in conditions, and the necessity +for detailed interpretations to ensure patient safety. Previous segmentation +methods have sought to reduce image noise and enhance diagnostic performance, +but these techniques require fine-grained, pixel-level ground truth masks for +training. In contrast, with the rise of foundation models, the Segment Anything +Model (SAM) has been introduced to facilitate promptable segmentation, enabling +the automation of the segmentation process with simple yet effective prompts. +Efforts applying SAM predominantly focus on dermatoscopy images, which present +more easily identifiable lesion boundaries than clinical photos taken with +smartphones. This limitation constrains the practicality of these approaches to +real-world applications. To overcome the challenges posed by noisy clinical +photos acquired via non-standardized protocols and to improve diagnostic +accessibility, we propose a novel Cross-Attentive Fusion framework for +interpretable skin lesion diagnosis. Our method leverages SAM to generate +visual concepts for skin diseases using prompts, integrating local visual +concepts with global image features to enhance model performance. Extensive +evaluation on two skin disease datasets demonstrates our proposed method's +effectiveness on lesion diagnosis and interpretability. + +
+
+ comment: This paper is accepted by WACV 2025 +
+
+
+
+
+ + ♻ ☆ GauFRe: Gaussian Deformation Fields for Real-time Dynamic Novel View + Synthesis + + +
+ We propose a method that achieves state-of-the-art rendering quality and +efficiency on monocular dynamic scene reconstruction using deformable 3D +Gaussians. Implicit deformable representations commonly model motion with a +canonical space and time-dependent backward-warping deformation field. Our +method, GauFRe, uses a forward-warping deformation to explicitly model +non-rigid transformations of scene geometry. Specifically, we propose a +template set of 3D Gaussians residing in a canonical space, and a +time-dependent forward-warping deformation field to model dynamic objects. +Additionally, we tailor a 3D Gaussian-specific static component supported by an +inductive bias-aware initialization approach which allows the deformation field +to focus on moving scene regions, improving the rendering of complex real-world +motion. The differentiable pipeline is optimized end-to-end with a +self-supervised rendering loss. Experiments show our method achieves +competitive results and higher efficiency than both previous state-of-the-art +NeRF and Gaussian-based methods. For real-world scenes, GauFRe can train in ~20 +mins and offer 96 FPS real-time rendering on an RTX 3090 GPU. Project website: +https://lynl7130.github.io/gaufre/index.html + +
+
+ comment: WACV 2025. 11 pages, 8 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Key-Exchange Convolutional Auto-Encoder for Data Augmentation in Early + Knee Osteoarthritis Detection + + +
+ Knee Osteoarthritis (KOA) is a common musculoskeletal condition that +significantly affects mobility and quality of life, particularly in elderly +populations. However, training deep learning models for early KOA +classification is often hampered by the limited availability of annotated +medical datasets, owing to the high costs and labour-intensive nature of data +labelling. Traditional data augmentation techniques, while useful, rely on +simple transformations and fail to introduce sufficient diversity into the +dataset. To address these challenges, we propose the Key-Exchange Convolutional +Auto-Encoder (KECAE) as an innovative Artificial Intelligence (AI)-based data +augmentation strategy for early KOA classification. Our model employs a +convolutional autoencoder with a novel key-exchange mechanism that generates +synthetic images by selectively exchanging key pathological features between +X-ray images, which not only diversifies the dataset but also ensures the +clinical validity of the augmented data. A hybrid loss function is introduced +to supervise feature learning and reconstruction, integrating multiple +components, including reconstruction, supervision, and feature separation +losses. Experimental results demonstrate that the KECAE-generated data +significantly improve the performance of KOA classification models, with +accuracy gains of up to 1.98% across various standard and state-of-the-art +architectures. Furthermore, a clinical validation study involving expert +radiologists confirms the anatomical plausibility and diagnostic realism of the +synthetic outputs. These findings highlight the potential of KECAE as a robust +tool for augmenting medical datasets in early KOA detection. + +
+
+
+
+
+ + ♻ ☆ Human Activity Recognition in an Open World + + +
+ Managing novelty in perception-based human activity recognition (HAR) is +critical in realistic settings to improve task performance over time and ensure +solution generalization outside of prior seen samples. Novelty manifests in HAR +as unseen samples, activities, objects, environments, and sensor changes, among +other ways. Novelty may be task-relevant, such as a new class or new features, +or task-irrelevant resulting in nuisance novelty, such as never before seen +noise, blur, or distorted video recordings. To perform HAR optimally, +algorithmic solutions must be tolerant to nuisance novelty, and learn over time +in the face of novelty. This paper 1) formalizes the definition of novelty in +HAR building upon the prior definition of novelty in classification tasks, 2) +proposes an incremental open world learning (OWL) protocol and applies it to +the Kinetics datasets to generate a new benchmark KOWL-718, 3) analyzes the +performance of current state-of-the-art HAR models when novelty is introduced +over time, 4) provides a containerized and packaged pipeline for reproducing +the OWL protocol and for modifying for any future updates to Kinetics. The +experimental analysis includes an ablation study of how the different models +perform under various conditions as annotated by Kinetics-AVA. The protocol as +an algorithm for reproducing experiments using the KOWL-718 benchmark will be +publicly released with code and containers at +https://github.com/prijatelj/human-activity-recognition-in-an-open-world. The +code may be used to analyze different annotations and subsets of the Kinetics +datasets in an incremental open world fashion, as well as be extended as +further updates to Kinetics are released. + +
+
+ comment: 37 pages, 16 figures, 3 tables. Published in JAIR 81 on Dec 20, 2024. + All author affiliations are from during the paper's original funded work. + Updated info and current emails are provided in this version's first page +
+
+
+
+
+ + ♻ ☆ Confidence-Driven Deep Learning Framework for Early Detection of Knee + Osteoarthritis + + +
+ Knee Osteoarthritis (KOA) is a prevalent musculoskeletal disorder that +severely impacts mobility and quality of life, particularly among older adults. +Its diagnosis often relies on subjective assessments using the +Kellgren-Lawrence (KL) grading system, leading to variability in clinical +evaluations. To address these challenges, we propose a confidence-driven deep +learning framework for early KOA detection, focusing on distinguishing KL-0 and +KL-2 stages. The Siamese-based framework integrates a novel multi-level feature +extraction architecture with a hybrid loss strategy. Specifically, multi-level +Global Average Pooling (GAP) layers are employed to extract features from +varying network depths, ensuring comprehensive feature representation, while +the hybrid loss strategy partitions training samples into high-, medium-, and +low-confidence subsets. Tailored loss functions are applied to improve model +robustness and effectively handle uncertainty in annotations. Experimental +results on the Osteoarthritis Initiative (OAI) dataset demonstrate that the +proposed framework achieves competitive accuracy, sensitivity, and specificity, +comparable to those of expert radiologists. Cohen's kappa values (k > 0.85)) +confirm substantial agreement, while McNemar's test (p > 0.05) indicates no +statistically significant differences between the model and radiologists. +Additionally, Confidence distribution analysis reveals that the model emulates +radiologists' decision-making patterns. These findings highlight the potential +of the proposed approach to serve as an auxiliary diagnostic tool, enhancing +early KOA detection and reducing clinical workload. + +
+
+
+
+
+ + ♻ ☆ Enhancing Novel Object Detection via Cooperative Foundational Models + + +
+ In this work, we address the challenging and emergent problem of novel object +detection (NOD), focusing on the accurate detection of both known and novel +object categories during inference. Traditional object detection algorithms are +inherently closed-set, limiting their capability to handle NOD. We present a +novel approach to transform existing closed-set detectors into open-set +detectors. This transformation is achieved by leveraging the complementary +strengths of pre-trained foundational models, specifically CLIP and SAM, +through our cooperative mechanism. Furthermore, by integrating this mechanism +with state-of-the-art open-set detectors such as GDINO, we establish new +benchmarks in object detection performance. Our method achieves 17.42 mAP in +novel object detection and 42.08 mAP for known objects on the challenging LVIS +dataset. Adapting our approach to the COCO OVD split, we surpass the current +state-of-the-art by a margin of 7.2 $ \text{AP}_{50} $ for novel classes. Our +code is available at https://rohit901.github.io/coop-foundation-models/ . + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ MVTamperBench: Evaluating Robustness of Vision-Language Models + + +
+ Recent advancements in Vision-Language Models (VLMs) have enabled significant +progress in complex video understanding tasks. However, their robustness to +real-world manipulations remains underexplored, limiting their reliability in +critical applications. To address this gap, we introduce MVTamperBench, a +comprehensive benchmark designed to evaluate VLM's resilience to video +tampering effects, including rotation, dropping, masking, substitution, and +repetition. By systematically assessing state-of-the-art models, MVTamperBench +reveals substantial variability in robustness, with models like InternVL2-8B +achieving high performance, while others, such as Llama-VILA1.5-8B, exhibit +severe vulnerabilities. To foster broader adoption and reproducibility, +MVTamperBench is integrated into VLMEvalKit, a modular evaluation toolkit, +enabling streamlined testing and facilitating advancements in model robustness. +Our benchmark represents a critical step towards developing tamper-resilient +VLMs, ensuring their dependability in real-world scenarios. + Project Page: https://amitbcp.github.io/MVTamperBench/ + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 29 + +
+
+
+ + ☆ VINGS-Mono: Visual-Inertial Gaussian Splatting Monocular SLAM in Large + Scenes + + +
+ VINGS-Mono is a monocular (inertial) Gaussian Splatting (GS) SLAM framework +designed for large scenes. The framework comprises four main components: VIO +Front End, 2D Gaussian Map, NVS Loop Closure, and Dynamic Eraser. In the VIO +Front End, RGB frames are processed through dense bundle adjustment and +uncertainty estimation to extract scene geometry and poses. Based on this +output, the mapping module incrementally constructs and maintains a 2D Gaussian +map. Key components of the 2D Gaussian Map include a Sample-based Rasterizer, +Score Manager, and Pose Refinement, which collectively improve mapping speed +and localization accuracy. This enables the SLAM system to handle large-scale +urban environments with up to 50 million Gaussian ellipsoids. To ensure global +consistency in large-scale scenes, we design a Loop Closure module, which +innovatively leverages the Novel View Synthesis (NVS) capabilities of Gaussian +Splatting for loop closure detection and correction of the Gaussian map. +Additionally, we propose a Dynamic Eraser to address the inevitable presence of +dynamic objects in real-world outdoor scenes. Extensive evaluations in indoor +and outdoor environments demonstrate that our approach achieves localization +performance on par with Visual-Inertial Odometry while surpassing recent +GS/NeRF SLAM methods. It also significantly outperforms all existing methods in +terms of mapping and rendering quality. Furthermore, we developed a mobile app +and verified that our framework can generate high-quality Gaussian maps in real +time using only a smartphone camera and a low-frequency IMU sensor. To the best +of our knowledge, VINGS-Mono is the first monocular Gaussian SLAM method +capable of operating in outdoor environments and supporting kilometer-scale +large scenes. + +
+
+
+
+
+ + ☆ FDPP: Fine-tune Diffusion Policy with Human Preference + + +
+ Imitation learning from human demonstrations enables robots to perform +complex manipulation tasks and has recently witnessed huge success. However, +these techniques often struggle to adapt behavior to new preferences or changes +in the environment. To address these limitations, we propose Fine-tuning +Diffusion Policy with Human Preference (FDPP). FDPP learns a reward function +through preference-based learning. This reward is then used to fine-tune the +pre-trained policy with reinforcement learning (RL), resulting in alignment of +pre-trained policy with new human preferences while still solving the original +task. Our experiments across various robotic tasks and preferences demonstrate +that FDPP effectively customizes policy behavior without compromising +performance. Additionally, we show that incorporating Kullback-Leibler (KL) +regularization during fine-tuning prevents over-fitting and helps maintain the +competencies of the initial policy. + +
+
+
+
+
+ + ☆ Data-driven Spatial Classification using Multi-Arm Bandits for + Monitoring with Energy-Constrained Mobile Robots + + +
+ We consider the spatial classification problem for monitoring using data +collected by a coordinated team of mobile robots. Such classification problems +arise in several applications including search-and-rescue and precision +agriculture. Specifically, we want to classify the regions of a search +environment into interesting and uninteresting as quickly as possible using a +team of mobile sensors and mobile charging stations. We develop a data-driven +strategy that accommodates the noise in sensed data and the limited energy +capacity of the sensors, and generates collision-free motion plans for the +team. We propose a bi-level approach, where a high-level planner leverages a +multi-armed bandit framework to determine the potential regions of interest for +the drones to visit next based on the data collected online. Then, a low-level +path planner based on integer programming coordinates the paths for the team to +visit the target regions subject to the physical constraints. We characterize +several theoretical properties of the proposed approach, including anytime +guarantees and task completion time. We show the efficacy of our approach in +simulation, and further validate these observations in physical experiments +using mobile robots. + +
+
+ comment: 8 pages, 6 figures. See https://www.youtube.com/watch?v=gzulpOcVYzg + for an overview of the approach along with videos of the hardware experiments +
+
+
+
+
+ + ☆ Hybrid Action Based Reinforcement Learning for Multi-Objective + Compatible Autonomous Driving + + +
+ Reinforcement Learning (RL) has shown excellent performance in solving +decision-making and control problems of autonomous driving, which is +increasingly applied in diverse driving scenarios. However, driving is a +multi-attribute problem, leading to challenges in achieving multi-objective +compatibility for current RL methods, especially in both policy execution and +policy iteration. On the one hand, the common action space structure with +single action type limits driving flexibility or results in large behavior +fluctuations during policy execution. On the other hand, the multi-attribute +weighted single reward function result in the agent's disproportionate +attention to certain objectives during policy iterations. To this end, we +propose a Multi-objective Ensemble-Critic reinforcement learning method with +Hybrid Parametrized Action for multi-objective compatible autonomous driving. +Specifically, a parameterized action space is constructed to generate hybrid +driving actions, combining both abstract guidance and concrete control +commands. A multi-objective critics architecture is constructed considering +multiple attribute rewards, to ensure simultaneously focusing on different +driving objectives. Additionally, uncertainty-based exploration strategy is +introduced to help the agent faster approach viable driving policy. The +experimental results in both the simulated traffic environment and the HighD +dataset demonstrate that our method can achieve multi-objective compatible +autonomous driving in terms of driving efficiency, action consistency, and +safety. It enhances the general performance of the driving while significantly +increasing training efficiency. + +
+
+ comment: 12 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ HydroelasticTouch: Simulation of Tactile Sensors with Hydroelastic + Contact Surfaces + + +
+ Thanks to recent advancements in the development of inexpensive, +high-resolution tactile sensors, touch sensing has become popular in +contact-rich robotic manipulation tasks. With the surge of data-driven methods +and their requirement for substantial datasets, several methods of simulating +tactile sensors have emerged in the tactile research community to overcome +real-world data collection limitations. These simulation approaches can be +split into two main categories: fast but inaccurate (soft) point-contact models +and slow but accurate finite element modeling. In this work, we present a novel +approach to simulating pressure-based tactile sensors using the hydroelastic +contact model, which provides a high degree of physical realism at a reasonable +computational cost. This model produces smooth contact forces for soft-to-soft +and soft-to-rigid contacts along even non-convex contact surfaces. Pressure +values are approximated at each point of the contact surface and can be +integrated to calculate sensor outputs. We validate our models' capacity to +synthesize real-world tactile data by conducting zero-shot sim-to-real transfer +of a model for object state estimation. Our simulation is available as a +plug-in to our open-source, MuJoCo-based simulator. + +
+
+
+
+
+ + ☆ CHEQ-ing the Box: Safe Variable Impedance Learning for Robotic Polishing + + +
+ Robotic systems are increasingly employed for industrial automation, with +contact-rich tasks like polishing requiring dexterity and compliant behaviour. +These tasks are difficult to model, making classical control challenging. Deep +reinforcement learning (RL) offers a promising solution by enabling the +learning of models and control policies directly from data. However, its +application to real-world problems is limited by data inefficiency and unsafe +exploration. Adaptive hybrid RL methods blend classical control and RL +adaptively, combining the strengths of both: structure from control and +learning from RL. This has led to improvements in data efficiency and +exploration safety. However, their potential for hardware applications remains +underexplored, with no evaluations on physical systems to date. Such +evaluations are critical to fully assess the practicality and effectiveness of +these methods in real-world settings. This work presents an experimental +demonstration of the hybrid RL algorithm CHEQ for robotic polishing with +variable impedance, a task requiring precise force and velocity tracking. In +simulation, we show that variable impedance enhances polishing performance. We +compare standalone RL with adaptive hybrid RL, demonstrating that CHEQ achieves +effective learning while adhering to safety constraints. On hardware, CHEQ +achieves effective polishing behaviour, requiring only eight hours of training +and incurring just five failures. These results highlight the potential of +adaptive hybrid RL for real-world, contact-rich tasks trained directly on +hardware. + +
+
+
+
+
+ + ☆ AI Guide Dog: Egocentric Path Prediction on Smartphone + + +
+ This paper introduces AI Guide Dog (AIGD), a lightweight egocentric +navigation assistance system for visually impaired individuals, designed for +real-time deployment on smartphones. AIGD addresses key challenges in blind +navigation by employing a vision-only, multi-label classification approach to +predict directional commands, ensuring safe traversal across diverse +environments. We propose a novel technique to enable goal-based outdoor +navigation by integrating GPS signals and high-level directions, while also +addressing uncertain multi-path predictions for destination-free indoor +navigation. Our generalized model is the first navigation assistance system to +handle both goal-oriented and exploratory navigation scenarios across indoor +and outdoor settings, establishing a new state-of-the-art in blind navigation. +We present methods, datasets, evaluations, and deployment insights to encourage +further innovations in assistive navigation systems. + +
+
+
+
+
+ + ☆ Low-Contact Grasping of Soft Tissue with Complex Geometry using a Vortex + Gripper + + +
+ Soft tissue manipulation is an integral aspect of most surgical procedures; +however, the vast majority of surgical graspers used today are made of hard +materials, such as metals or hard plastics. Furthermore, these graspers +predominately function by pinching tissue between two hard objects as a method +for tissue manipulation. As such, the potential to apply too much force during +contact, and thus damage tissue, is inherently high. As an alternative +approach, gaspers developed using a pneumatic vortex could potentially levitate +soft tissue, enabling manipulation with low or even no contact force. In this +paper, we present the design and well as a full factorial study of the force +characteristics of the vortex gripper grasping soft surfaces with four common +shapes, with convex and concave curvature, and ranging over 10 different radii +of curvature, for a total of 40 unique surfaces. By changing the parameters of +the nozzle elements in the design of the gripper, it was possible to +investigate the influence of the mass flow parameters of the vortex gripper on +the lifting force for all of these different soft surfaces. An $\pmb{ex}$ +$\pmb{vivo}$ experiment was conducted on grasping biological tissues and soft +balls of various shapes to show the advantages and disadvantages of the +proposed technology. The obtained results allowed us to find limitations in the +use of vortex technology and the following stages of its improvement for +medical use. + +
+
+ comment: Submitted to T-MRB +
+
+
+
+
+ + ☆ Electrostatic Clutches Enable High-Force Mechanical Multiplexing: + Demonstrating Single-Motor Full-Actuation of a 4-DoF Hand + + +
+ This paper introduces a novel mechanical multiplexing system powered by +electrostatic capstan clutches, enabling high-force, single-motor control of +multiple degrees of freedom (DoF). The system is capable of both bidirectional +single-input single-output time-division and single-input multiple-output +multiplexing to actuate a commercial 4-DoF robotic hand with a single motor. +Our mechanical multiplexer is also capable of powerless position holding owing +to its use of a leadscrew nut acting as the output. Experimental results +demonstrate the effectiveness of this approach, achieving individual and +simultaneous actuation. This innovation offers a scalable solution for high-DoF +robotic systems, providing a path to efficient actuation in robotic platforms. + +
+
+
+
+
+ + ☆ Toward Zero-Shot User Intent Recognition in Shared Autonomy + + +
+ A fundamental challenge of shared autonomy is to use high-DoF robots to +assist, rather than hinder, humans by first inferring user intent and then +empowering the user to achieve their intent. Although successful, prior methods +either rely heavily on a priori knowledge of all possible human intents or +require many demonstrations and interactions with the human to learn these +intents before being able to assist the user. We propose and study a zero-shot, +vision-only shared autonomy (VOSA) framework designed to allow robots to use +end-effector vision to estimate zero-shot human intents in conjunction with +blended control to help humans accomplish manipulation tasks with unknown and +dynamically changing object locations. To demonstrate the effectiveness of our +VOSA framework, we instantiate a simple version of VOSA on a Kinova Gen3 +manipulator and evaluate our system by conducting a user study on three +tabletop manipulation tasks. The performance of VOSA matches that of an oracle +baseline model that receives privileged knowledge of possible human intents +while also requiring significantly less effort than unassisted teleoperation. +In more realistic settings, where the set of possible human intents is fully or +partially unknown, we demonstrate that VOSA requires less human effort and time +than baseline approaches while being preferred by a majority of the +participants. Our results demonstrate the efficacy and efficiency of using +off-the-shelf vision algorithms to enable flexible and beneficial shared +control of a robot manipulator. Code and videos available here: +https://sites.google.com/view/zeroshot-sharedautonomy/home. + +
+
+ comment: 10 pages, 6 figures, Accepted to IEEE/ACM International Conference on + Human-Robot Interaction (HRI), 2025. Equal Contribution from the first three + authors +
+
+
+
+
+ + ♻ ☆ FaVoR: Features via Voxel Rendering for Camera Relocalization + + +
+ Camera relocalization methods range from dense image alignment to direct +camera pose regression from a query image. Among these, sparse feature matching +stands out as an efficient, versatile, and generally lightweight approach with +numerous applications. However, feature-based methods often struggle with +significant viewpoint and appearance changes, leading to matching failures and +inaccurate pose estimates. To overcome this limitation, we propose a novel +approach that leverages a globally sparse yet locally dense 3D representation +of 2D features. By tracking and triangulating landmarks over a sequence of +frames, we construct a sparse voxel map optimized to render image patch +descriptors observed during tracking. Given an initial pose estimate, we first +synthesize descriptors from the voxels using volumetric rendering and then +perform feature matching to estimate the camera pose. This methodology enables +the generation of descriptors for unseen views, enhancing robustness to view +changes. We extensively evaluate our method on the 7-Scenes and Cambridge +Landmarks datasets. Our results show that our method significantly outperforms +existing state-of-the-art feature representation techniques in indoor +environments, achieving up to a 39% improvement in median translation error. +Additionally, our approach yields comparable results to other methods for +outdoor scenarios while maintaining lower memory and computational costs. + +
+
+ comment: Accepted to the IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV), Tucson, Arizona, US, Feb 28-Mar 4, 2025 +
+
+
+
+
+ + ♻ ☆ Vid2Sim: Realistic and Interactive Simulation from Video for Urban + Navigation + + +
+ Sim-to-real gap has long posed a significant challenge for robot learning in +simulation, preventing the deployment of learned models in the real world. +Previous work has primarily focused on domain randomization and system +identification to mitigate this gap. However, these methods are often limited +by the inherent constraints of the simulation and graphics engines. In this +work, we propose Vid2Sim, a novel framework that effectively bridges the +sim2real gap through a scalable and cost-efficient real2sim pipeline for neural +3D scene reconstruction and simulation. Given a monocular video as input, +Vid2Sim can generate photorealistic and physically interactable 3D simulation +environments to enable the reinforcement learning of visual navigation agents +in complex urban environments. Extensive experiments demonstrate that Vid2Sim +significantly improves the performance of urban navigation in the digital twins +and real world by 31.2% and 68.3% in success rate compared with agents trained +with prior simulation methods. + +
+
+ comment: Project page: https://metadriverse.github.io/vid2sim/ +
+
+
+
+
+ + ♻ ☆ Virtual Reflections on a Dynamic 2D Eye Model Improve Spatial Reference + Identification + + +
+ The visible orientation of human eyes creates some transparency about +people's spatial attention and other mental states. This leads to a dual role +for the eyes as a means of sensing and communication. Accordingly, artificial +eye models are being explored as communication media in human-machine +interaction scenarios. One challenge in the use of eye models for communication +consists of resolving spatial reference ambiguities, especially for +screen-based models. Here, we introduce an approach for overcoming this +challenge through the introduction of reflection-like features that are +contingent on artificial eye movements. We conducted a user study with 30 +participants in which participants had to use spatial references provided by +dynamic eye models to advance in a fast-paced group interaction task. Compared +to a non-reflective eye model and a pure reflection mode, their combination in +the new approach resulted in a higher identification accuracy and user +experience, suggesting a synergistic benefit. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ GenSafe: A Generalizable Safety Enhancer for Safe Reinforcement Learning + Algorithms Based on Reduced Order Markov Decision Process Model + + +
+ Safe Reinforcement Learning (SRL) aims to realize a safe learning process for +Deep Reinforcement Learning (DRL) algorithms by incorporating safety +constraints. However, the efficacy of SRL approaches often relies on accurate +function approximations, which are notably challenging to achieve in the early +learning stages due to data insufficiency. To address this issue, we introduce +in this work a novel Generalizable Safety enhancer (GenSafe) that is able to +overcome the challenge of data insufficiency and enhance the performance of SRL +approaches. Leveraging model order reduction techniques, we first propose an +innovative method to construct a Reduced Order Markov Decision Process (ROMDP) +as a low-dimensional approximator of the original safety constraints. Then, by +solving the reformulated ROMDP-based constraints, GenSafe refines the actions +of the agent to increase the possibility of constraint satisfaction. +Essentially, GenSafe acts as an additional safety layer for SRL algorithms. We +evaluate GenSafe on multiple SRL approaches and benchmark problems. The results +demonstrate its capability to improve safety performance, especially in the +early learning phases, while maintaining satisfactory task performance. Our +proposed GenSafe not only offers a novel measure to augment existing SRL +methods but also shows broad compatibility with various SRL algorithms, making +it applicable to a wide range of systems and SRL problems. + +
+
+
+
+
+ + ♻ ☆ Perception Matters: Enhancing Embodied AI with Uncertainty-Aware + Semantic Segmentation + + +
+ Embodied AI has made significant progress acting in unexplored environments. +However, tasks such as object search have largely focused on efficient policy +learning. In this work, we identify several gaps in current search methods: +They largely focus on dated perception models, neglect temporal aggregation, +and transfer from ground truth directly to noisy perception at test time, +without accounting for the resulting overconfidence in the perceived state. We +address the identified problems through calibrated perception probabilities and +uncertainty across aggregation and found decisions, thereby adapting the models +for sequential tasks. The resulting methods can be directly integrated with +pretrained models across a wide family of existing search approaches at no +additional training cost. We perform extensive evaluations of aggregation +methods across both different semantic perception models and policies, +confirming the importance of calibrated uncertainties in both the aggregation +and found decisions. We make the code and trained models available at +https://semantic-search.cs.uni-freiburg.de. + +
+
+
+
+
+ + ♻ ☆ DIDLM: A SLAM Dataset for Difficult Scenarios Featuring Infrared, Depth + Cameras, LIDAR, 4D Radar, and Others under Adverse Weather, Low Light + Conditions, and Rough Roads + + +
+ Adverse weather conditions, low-light environments, and bumpy road surfaces +pose significant challenges to SLAM in robotic navigation and autonomous +driving. Existing datasets in this field predominantly rely on single sensors +or combinations of LiDAR, cameras, and IMUs. However, 4D millimeter-wave radar +demonstrates robustness in adverse weather, infrared cameras excel in capturing +details under low-light conditions, and depth images provide richer spatial +information. Multi-sensor fusion methods also show potential for better +adaptation to bumpy roads. Despite some SLAM studies incorporating these +sensors and conditions, there remains a lack of comprehensive datasets +addressing low-light environments and bumpy road conditions, or featuring a +sufficiently diverse range of sensor data. In this study, we introduce a +multi-sensor dataset covering challenging scenarios such as snowy weather, +rainy weather, nighttime conditions, speed bumps, and rough terrains. The +dataset includes rarely utilized sensors for extreme conditions, such as 4D +millimeter-wave radar, infrared cameras, and depth cameras, alongside 3D LiDAR, +RGB cameras, GPS, and IMU. It supports both autonomous driving and ground robot +applications and provides reliable GPS/INS ground truth data, covering +structured and semi-structured terrains. We evaluated various SLAM algorithms +using this dataset, including RGB images, infrared images, depth images, LiDAR, +and 4D millimeter-wave radar. The dataset spans a total of 18.5 km, 69 minutes, +and approximately 660 GB, offering a valuable resource for advancing SLAM +research under complex and extreme conditions. Our dataset is available at +https://github.com/GongWeiSheng/DIDLM. + +
+
+
+
+
+ + ♻ ☆ Evaluation of Artificial Intelligence Methods for Lead Time Prediction + in Non-Cycled Areas of Automotive Production + + +
+ The present study examines the effectiveness of applying Artificial +Intelligence methods in an automotive production environment to predict unknown +lead times in a non-cycle-controlled production area. Data structures are +analyzed to identify contextual features and then preprocessed using one-hot +encoding. Methods selection focuses on supervised machine learning techniques. +In supervised learning methods, regression and classification methods are +evaluated. Continuous regression based on target size distribution is not +feasible. Classification methods analysis shows that Ensemble Learning and +Support Vector Machines are the most suitable. Preliminary study results +indicate that gradient boosting algorithms LightGBM, XGBoost, and CatBoost +yield the best results. After further testing and extensive hyperparameter +optimization, the final method choice is the LightGBM algorithm. Depending on +feature availability and prediction interval granularity, relative prediction +accuracies of up to 90% can be achieved. Further tests highlight the importance +of periodic retraining of AI models to accurately represent complex production +processes using the database. The research demonstrates that AI methods can be +effectively applied to highly variable production data, adding business value +by providing an additional metric for various control tasks while outperforming +current non AI-based systems. + +
+
+
+
+
+ + ♻ ☆ GazeGrasp: DNN-Driven Robotic Grasping with Wearable Eye-Gaze Interface + + +
+ We present GazeGrasp, a gaze-based manipulation system enabling individuals +with motor impairments to control collaborative robots using eye-gaze. The +system employs an ESP32 CAM for eye tracking, MediaPipe for gaze detection, and +YOLOv8 for object localization, integrated with a Universal Robot UR10 for +manipulation tasks. After user-specific calibration, the system allows +intuitive object selection with a magnetic snapping effect and robot control +via eye gestures. Experimental evaluation involving 13 participants +demonstrated that the magnetic snapping effect significantly reduced gaze +alignment time, improving task efficiency by 31%. GazeGrasp provides a robust, +hands-free interface for assistive robotics, enhancing accessibility and +autonomy for users. + +
+
+ comment: Accepted to: IEEE/ACM International Conference on Human-Robot + Interaction (HRI 2025) +
+
+
+
+
+ + ♻ ☆ Cooperative Aerial Robot Inspection Challenge: A Benchmark for + Heterogeneous Multi-UAV Planning and Lessons Learned + + +
+ We propose the Cooperative Aerial Robot Inspection Challenge (CARIC), a +simulation-based benchmark for motion planning algorithms in heterogeneous +multi-UAV systems. CARIC features UAV teams with complementary sensors, +realistic constraints, and evaluation metrics prioritizing inspection quality +and efficiency. It offers a ready-to-use perception-control software stack and +diverse scenarios to support the development and evaluation of task allocation +and motion planning algorithms. Competitions using CARIC were held at IEEE CDC +2023 and the IROS 2024 Workshop on Multi-Robot Perception and Navigation, +attracting innovative solutions from research teams worldwide. This paper +examines the top three teams from CDC 2023, analyzing their exploration, +inspection, and task allocation strategies while drawing insights into their +performance across scenarios. The results highlight the task's complexity and +suggest promising directions for future research in cooperative multi-UAV +systems. + +
+
+ comment: Please find our website at https://ntu-aris.github.io/caric +
+
+
+
+
+ + ♻ ☆ Analyzing Infrastructure LiDAR Placement with Realistic LiDAR Simulation + Library ICRA'23 + + +
+ Recently, Vehicle-to-Everything(V2X) cooperative perception has attracted +increasing attention. Infrastructure sensors play a critical role in this +research field; however, how to find the optimal placement of infrastructure +sensors is rarely studied. In this paper, we investigate the problem of +infrastructure sensor placement and propose a pipeline that can efficiently and +effectively find optimal installation positions for infrastructure sensors in a +realistic simulated environment. To better simulate and evaluate LiDAR +placement, we establish a Realistic LiDAR Simulation library that can simulate +the unique characteristics of different popular LiDARs and produce +high-fidelity LiDAR point clouds in the CARLA simulator. Through simulating +point cloud data in different LiDAR placements, we can evaluate the perception +accuracy of these placements using multiple detection models. Then, we analyze +the correlation between the point cloud distribution and perception accuracy by +calculating the density and uniformity of regions of interest. Experiments show +that when using the same number and type of LiDAR, the placement scheme +optimized by our proposed method improves the average precision by 15%, +compared with the conventional placement scheme in the standard lane scene. We +also analyze the correlation between perception performance in the region of +interest and LiDAR point cloud distribution and validate that density and +uniformity can be indicators of performance. Both the RLS Library and related +code will be released at https://github.com/PJLab-ADG/PCSim. + +
+
+ comment: 7 pages, 6 figures, accepted to the IEEE International Conference on + Robotics and Automation (ICRA'23) +
+
+
+
+
+ + ♻ ☆ Cost-Effective Robotic Handwriting System with AI Integration + + +
+ This paper introduces a cost-effective robotic handwriting system designed to +replicate human-like handwriting with high precision. Combining a Raspberry Pi +Pico microcontroller, 3D-printed components, and a machine learning-based +handwriting generation model implemented via TensorFlow, the system converts +user-supplied text into realistic stroke trajectories. By leveraging +lightweight 3D-printed materials and efficient mechanical designs, the system +achieves a total hardware cost of approximately \$56, significantly +undercutting commercial alternatives. Experimental evaluations demonstrate +handwriting precision within $\pm$0.3 millimeters and a writing speed of +approximately 200 mm/min, positioning the system as a viable solution for +educational, research, and assistive applications. This study seeks to lower +the barriers to personalized handwriting technologies, making them accessible +to a broader audience. + +
+
+ comment: This is an updated version of a paper originally presented at the + 2024 IEEE Long Island Systems, Applications and Technology Conference (LISAT) +
+
+
+
+
+ + ♻ ☆ Tactile-based Exploration, Mapping and Navigation with + Collision-Resilient Aerial Vehicles + + +
+ This article introduces XPLORER, a passive deformable UAV with a +spring-augmented chassis and proprioceptive state awareness, designed to endure +collisions and maintain smooth contact. We develop a fast-converging external +force estimation algorithm for XPLORER that leverages onboard sensors and +proprioceptive data for contact and collision detection. Using this force +information, we propose four motion primitives, including three novel +tactile-based primitives: tactile-traversal, tactile-turning, and +ricocheting-to aid XPLORER in navigating unknown environments. These primitives +are synthesized autonomously in real-time to enable efficient exploration and +navigation by leveraging collisions and contacts. Experimental results +demonstrate the effectiveness of our approach, highlighting the potential of +passive deformable UAVs for contact-rich real-world tasks such as +non-destructive inspection, surveillance and mapping, and pursuit/evasion. + +
+
+
+
+
+ + ♻ ☆ Safety Implications of Explainable Artificial Intelligence in End-to-End + Autonomous Driving + + +
+ The end-to-end learning pipeline is gradually creating a paradigm shift in +the ongoing development of highly autonomous vehicles, largely due to advances +in deep learning, the availability of large-scale training datasets, and +improvements in integrated sensor devices. However, a lack of explainability in +real-time decisions with contemporary learning methods impedes user trust and +attenuates the widespread deployment and commercialization of such vehicles. +Moreover, the issue is exacerbated when these cars are involved in or cause +traffic accidents. Consequently, explainability in end-to-end autonomous +driving is essential to build trust in vehicular automation. With that said, +automotive researchers have not yet rigorously explored safety benefits and +consequences of explanations in end-to-end autonomous driving. This paper aims +to bridge the gaps between these topics and seeks to answer the following +research question: What are safety implications of explanations in end-to-end +autonomous driving? In this regard, we first revisit established safety and +explainability concepts in end-to-end driving. Furthermore, we present three +critical case studies and show the pivotal role of explanations in enhancing +self-driving safety. Finally, we describe insights from empirical studies and +reveal potential value, limitations, and caveats of practical explainable AI +methods with respect to their safety assurance in end-to-end driving. + +
+
+
+
+
+ + ♻ ☆ Cooperative and Asynchronous Transformer-based Mission Planning for + Heterogeneous Teams of Mobile Robots + + +
+ Cooperative mission planning for heterogeneous teams of mobile robots +presents a unique set of challenges, particularly when operating under +communication constraints and limited computational resources. To address these +challenges, we propose the Cooperative and Asynchronous Transformer-based +Mission Planning (CATMiP) framework, which leverages multi-agent reinforcement +learning (MARL) to coordinate distributed decision making among agents with +diverse sensing, motion, and actuation capabilities, operating under sporadic +ad hoc communication. A Class-based Macro-Action Decentralized Partially +Observable Markov Decision Process (CMacDec-POMDP) is also formulated to +effectively model asynchronous decision-making for heterogeneous teams of +agents. The framework utilizes an asynchronous centralized training and +distributed execution scheme that is developed based on the Multi-Agent +Transformer (MAT) architecture. This design allows a single trained model to +generalize to larger environments and accommodate varying team sizes and +compositions. We evaluate CATMiP in a 2D grid-world simulation environment and +compare its performance against planning-based exploration methods. Results +demonstrate CATMiP's superior efficiency, scalability, and robustness to +communication dropouts, highlighting its potential for real-world heterogeneous +mobile robot systems. The code is available at +https://github.com/mylad13/CATMiP. + +
+
+ comment: 27 pages, 8 figures, this work has been submitted to Elsevier for + possible publication +
+
+
+
+
+ + ♻ ☆ A Signal Temporal Logic Approach for Task-Based Coordination of + Multi-Aerial Systems: a Wind Turbine Inspection Case Study + + +
+ The paper addresses task assignment and trajectory generation for +collaborative inspection missions using a fleet of multi-rotors, focusing on +the wind turbine inspection scenario. The proposed solution enables safe and +feasible trajectories while accommodating heterogeneous time-bound constraints +and vehicle physical limits. An optimization problem is formulated to meet +mission objectives and temporal requirements encoded as Signal Temporal Logic +(STL) specifications. Additionally, an event-triggered replanner is introduced +to address unforeseen events and compensate for lost time. Furthermore, a +generalized robustness scoring method is employed to reflect user preferences +and mitigate task conflicts. The effectiveness of the proposed approach is +demonstrated through MATLAB and Gazebo simulations, as well as field +multi-robot experiments in a mock-up scenario. + +
+
+ comment: ©2025 Elsevier. This work has been accepted to "Robotics and Autonomous Systems" for possible publication. Personal use of this material is permitted. Permission from Elsevier must be obtained for all other uses +
+
+
+
+
+ + ♻ ☆ Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous + Sensors via Language Grounding + + +
+ Interacting with the world is a multi-sensory experience: achieving effective general-purpose interaction requires making use of all available modalities -- including vision, touch, and audio -- to fill in gaps from partial observation. For example, when vision is occluded while reaching into a bag, a robot should rely on its senses of touch and sound. However, state-of-the-art generalist robot policies are typically trained on large datasets to predict robot actions solely from visual and proprioceptive observations. In this work, we propose FuSe, a novel approach that enables finetuning visuomotor generalist policies on heterogeneous sensor modalities for which large datasets are not readily available, by leveraging natural language as a common cross-modal grounding. We combine a multimodal contrastive loss with a sensory-grounded language generation loss to encode high-level semantics. In the context of robot manipulation, we show that FuSe enables performing challenging tasks that require reasoning jointly over modalities such as vision, touch, and sound in a zero-shot setting, including multimodal prompting, compositional cross-modal prompting, and describing the objects it interacts with. We show that the same recipe is applicable to widely different generalist policies, including both diffusion-based generalist policies and large vision-language-action (VLA) models. Extensive experiments in the real world show that FuSe is able to increase success rates by over 20% compared to all considered baselines. + +
+
+
+
+
+ + ♻ ☆ Resilient Distributed Optimization for Multi-Agent Cyberphysical Systems + + +
+ This work focuses on the problem of distributed optimization in multi-agent +cyberphysical systems, where a legitimate agent's iterates are influenced both +by the values it receives from potentially malicious neighboring agents, and by +its own self-serving target function. We develop a new algorithmic and +analytical framework to achieve resilience for the class of problems where +stochastic values of trust between agents exist and can be exploited. In this +case, we show that convergence to the true global optimal point can be +recovered, both in mean and almost surely, even in the presence of malicious +agents. Furthermore, we provide expected convergence rate guarantees in the +form of upper bounds on the expected squared distance to the optimal value. +Finally, numerical results are presented that validate our analytical +convergence guarantees even when the malicious agents compose the majority of +agents in the network and where existing methods fail to converge to the +optimal nominal points. + +
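As a rough illustration of the idea of exploiting stochastic inter-agent trust, the sketch below filters neighbor iterates by an accumulated trust score before a consensus-plus-gradient step. The function name, thresholding rule, and toy objective are assumptions made for illustration; they do not reproduce the paper's algorithm or its convergence-rate guarantees.

```python
import numpy as np

def trusted_update(x_self, x_neighbors, trust_obs, grad, step=0.1, threshold=0.5):
    """One illustrative resilient consensus-gradient step.

    x_self:      current local iterate, shape (d,)
    x_neighbors: neighbor iterates, shape (m, d)
    trust_obs:   running averages of stochastic trust values in [0, 1], shape (m,)
    grad:        gradient of the local objective at x_self, shape (d,)
    """
    # Keep only neighbors whose accumulated trust clears the threshold.
    keep = trust_obs >= threshold
    if keep.any():
        mixed = np.vstack([x_neighbors[keep], x_self[None, :]]).mean(axis=0)
    else:
        mixed = x_self                      # fall back to the local iterate
    return mixed - step * grad              # descend on the local objective

# Toy example: two honest neighbors near the optimum, one adversarial outlier.
x = np.zeros(2)
neighbors = np.array([[1.0, 1.0], [0.9, 1.1], [50.0, -50.0]])
trust = np.array([0.9, 0.8, 0.1])            # the outlier has low observed trust
grad = 2 * (x - np.array([1.0, 1.0]))        # gradient of ||x - (1,1)||^2
print(trusted_update(x, neighbors, trust, grad))
```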
+
+ comment: Accepted for publication in the IEEE Transactions on Automatic + Control +
+
+
+
+
+ + ♻ ☆ SYNAPSE: SYmbolic Neural-Aided Preference Synthesis Engine + + +
+ This paper addresses the problem of preference learning, which aims to align robot behaviors through learning user-specific preferences (e.g. "good pull-over location") from visual demonstrations. Despite its similarity to learning factual concepts (e.g. "red door"), preference learning is a fundamentally harder problem due to its subjective nature and the paucity of person-specific training data. We address this problem using a novel framework called SYNAPSE, which is a neuro-symbolic approach designed to efficiently learn preferential concepts from limited data. SYNAPSE represents preferences as neuro-symbolic programs, facilitating inspection of individual parts for alignment, in a domain-specific language (DSL) that operates over images, and leverages a novel combination of visual parsing, large language models, and program synthesis to learn programs representing individual preferences. We perform extensive evaluations on various preferential concepts as well as user case studies demonstrating its ability to align well with dissimilar user preferences. Our method significantly outperforms baselines, especially when it comes to out-of-distribution generalization. We show the importance of the design choices in the framework through multiple ablation studies. Code, additional results, and supplementary material can be found on the website: https://amrl.cs.utexas.edu/synapse + +
+
+ comment: Accepted (oral) at AAAI 25 +
+
+
+
+
+ + ♻ ☆ GestLLM: Advanced Hand Gesture Interpretation via Large Language Models + for Human-Robot Interaction + + +
+ This paper introduces GestLLM, an advanced system for human-robot interaction that enables intuitive robot control through hand gestures. Unlike conventional systems, which rely on a limited set of predefined gestures, GestLLM leverages large language models and feature extraction via MediaPipe to interpret a diverse range of gestures. This integration addresses key limitations in existing systems, such as restricted gesture flexibility and the inability to recognize complex or unconventional gestures commonly used in human communication. By combining state-of-the-art feature extraction and language model capabilities, GestLLM achieves performance comparable to leading vision-language models while supporting gestures underrepresented in traditional datasets. For example, it recognizes gestures from popular culture, such as the "Vulcan salute" from Star Trek, without any additional pretraining or prompt engineering. This flexibility enhances the naturalness and inclusivity of robot control, making interactions more intuitive and user-friendly. GestLLM provides a significant step forward in gesture-based interaction, enabling robots to understand and respond to a wide variety of hand gestures effectively. This paper outlines its design, implementation, and evaluation, demonstrating its potential applications in advanced human-robot collaboration, assistive robotics, and interactive entertainment. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 151 + +
+
+
+ + ☆ DAViD: Modeling Dynamic Affordance of 3D Objects using Pre-trained Video + Diffusion Models + + +
+ Understanding the ability of humans to use objects is crucial for AI to +improve daily life. Existing studies for learning such ability focus on +human-object patterns (e.g., contact, spatial relation, orientation) in static +situations, and learning Human-Object Interaction (HOI) patterns over time +(i.e., movement of human and object) is relatively less explored. In this +paper, we introduce a novel type of affordance named Dynamic Affordance. For a +given input 3D object mesh, we learn dynamic affordance which models the +distribution of both (1) human motion and (2) human-guided object pose during +interactions. As a core idea, we present a method to learn the 3D dynamic +affordance from synthetically generated 2D videos, leveraging a pre-trained +video diffusion model. Specifically, we propose a pipeline that first generates +2D HOI videos from the 3D object and then lifts them into 3D to generate 4D HOI +samples. Once we generate diverse 4D HOI samples on various target objects, we +train our DAViD, where we present a method based on the Low-Rank Adaptation +(LoRA) module for pre-trained human motion diffusion model (MDM) and an object +pose diffusion model with human pose guidance. Our motion diffusion model is +extended for multi-object interactions, demonstrating the advantage of our +pipeline with LoRA for combining the concepts of object usage. Through +extensive experiments, we demonstrate our DAViD outperforms the baselines in +generating human motion with HOIs. + +
+
+ comment: Project Page: https://snuvclab.github.io/david/ +
+
+
+
+
+ + ☆ MangaNinja: Line Art Colorization with Precise Reference Following + + +
+ Derived from diffusion models, MangaNinja specializes in the task of reference-guided line art colorization. We incorporate two thoughtful designs to ensure precise character detail transcription, including a patch shuffling module to facilitate correspondence learning between the reference color image and the target line art, and a point-driven control scheme to enable fine-grained color matching. Experiments on a self-collected benchmark demonstrate the superiority of our model over current solutions in terms of precise colorization. We further showcase the potential of the proposed interactive point control in handling challenging cases, such as cross-character colorization and multi-reference harmonization, which are beyond the reach of existing algorithms. + +
+
+ comment: Project page and code: https://johanan528.github.io/MangaNinjia/ +
+
+
+
+
+ + ☆ Go-with-the-Flow: Motion-Controllable Video Diffusion Models Using + Real-Time Warped Noise + + +
+ Generative modeling aims to transform random noise into structured outputs. +In this work, we enhance video diffusion models by allowing motion control via +structured latent noise sampling. This is achieved by just a change in data: we +pre-process training videos to yield structured noise. Consequently, our method +is agnostic to diffusion model design, requiring no changes to model +architectures or training pipelines. Specifically, we propose a novel noise +warping algorithm, fast enough to run in real time, that replaces random +temporal Gaussianity with correlated warped noise derived from optical flow +fields, while preserving the spatial Gaussianity. The efficiency of our +algorithm enables us to fine-tune modern video diffusion base models using +warped noise with minimal overhead, and provide a one-stop solution for a wide +range of user-friendly motion control: local object motion control, global +camera movement control, and motion transfer. The harmonization between +temporal coherence and spatial Gaussianity in our warped noise leads to +effective motion control while maintaining per-frame pixel quality. Extensive +experiments and user studies demonstrate the advantages of our method, making +it a robust and scalable approach for controlling motion in video diffusion +models. Video results are available on our webpage: +https://vgenai-netflix-eyeline-research.github.io/Go-with-the-Flow/; source +code and model checkpoints are available on GitHub: +https://github.com/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow. + +
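The core data trick described above, replacing temporally i.i.d. noise with noise carried along optical flow, can be sketched in a few lines. This is a simplified nearest-neighbor warp with per-frame re-standardization; the paper's real-time algorithm preserves spatial Gaussianity more carefully, so treat the helper and its arguments as illustrative assumptions rather than the released implementation.

```python
import numpy as np

def warp_noise(prev_noise, flow):
    """Carry a noise field from frame t to t+1 along backward optical flow.

    prev_noise: (H, W, C) Gaussian noise of frame t
    flow:       (H, W, 2) backward flow, flow[y, x] = displacement into frame t
    """
    H, W, _ = prev_noise.shape
    ys, xs = np.meshgrid(np.arange(H), np.arange(W), indexing="ij")
    src_x = np.clip(np.round(xs + flow[..., 0]).astype(int), 0, W - 1)
    src_y = np.clip(np.round(ys + flow[..., 1]).astype(int), 0, H - 1)
    warped = prev_noise[src_y, src_x]            # nearest-neighbor gather
    # Re-standardize so each frame's noise stays (approximately) unit Gaussian.
    return (warped - warped.mean()) / (warped.std() + 1e-8)

rng = np.random.default_rng(0)
noise_t = rng.standard_normal((64, 64, 4))        # latent-space noise of frame t
flow = np.ones((64, 64, 2)) * 3.0                 # a simple global 3-pixel shift
noise_t1 = warp_noise(noise_t, flow)              # temporally correlated noise for frame t+1
print(noise_t1.shape, round(float(noise_t1.std()), 3))
```

Because only the sampled noise changes, the same pre-trained diffusion model and training pipeline can be reused unchanged, which is the point the abstract emphasizes.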
+
+
+
+
+ + ☆ Predicting 4D Hand Trajectory from Monocular Videos + + +
+ We present HaPTIC, an approach that infers coherent 4D hand trajectories from +monocular videos. Current video-based hand pose reconstruction methods +primarily focus on improving frame-wise 3D pose using adjacent frames rather +than studying consistent 4D hand trajectories in space. Despite the additional +temporal cues, they generally underperform compared to image-based methods due +to the scarcity of annotated video data. To address these issues, we repurpose +a state-of-the-art image-based transformer to take in multiple frames and +directly predict a coherent trajectory. We introduce two types of lightweight +attention layers: cross-view self-attention to fuse temporal information, and +global cross-attention to bring in larger spatial context. Our method infers 4D +hand trajectories similar to the ground truth while maintaining strong 2D +reprojection alignment. We apply the method to both egocentric and allocentric +videos. It significantly outperforms existing methods in global trajectory +accuracy while being comparable to the state-of-the-art in single-image pose +estimation. Project website: https://judyye.github.io/haptic-www + +
+
+
+
+
+ + ☆ Omni-RGPT: Unifying Image and Video Region-level Understanding via Token + Marks + + +
+ We present Omni-RGPT, a multimodal large language model designed to +facilitate region-level comprehension for both images and videos. To achieve +consistent region representation across spatio-temporal dimensions, we +introduce Token Mark, a set of tokens highlighting the target regions within +the visual feature space. These tokens are directly embedded into spatial +regions using region prompts (e.g., boxes or masks) and simultaneously +incorporated into the text prompt to specify the target, establishing a direct +connection between visual and text tokens. To further support robust video +understanding without requiring tracklets, we introduce an auxiliary task that +guides Token Mark by leveraging the consistency of the tokens, enabling stable +region interpretation across the video. Additionally, we introduce a +large-scale region-level video instruction dataset (RegVID-300k). Omni-RGPT +achieves state-of-the-art results on image and video-based commonsense +reasoning benchmarks while showing strong performance in captioning and +referring expression comprehension tasks. + +
+
+ comment: Project page: https://miranheo.github.io/omni-rgpt/ +
+
+
+
+
+ + ☆ GameFactory: Creating New Games with Generative Interactive Videos + + +
+ Generative game engines have the potential to revolutionize game development by autonomously creating new content and reducing manual workload. However, existing video-based game generation methods fail to address the critical challenge of scene generalization, limiting their applicability to existing games with fixed styles and scenes. In this paper, we present GameFactory, a framework focused on exploring scene generalization in game video generation. To enable the creation of entirely new and diverse games, we leverage pre-trained video diffusion models trained on open-domain video data. To bridge the domain gap between open-domain priors and the small-scale game dataset, we propose a multi-phase training strategy that decouples game style learning from action control, preserving open-domain generalization while achieving action controllability. Using Minecraft as our data source, we release GF-Minecraft, a high-quality and diverse action-annotated video dataset for research. Furthermore, we extend our framework to enable autoregressive action-controllable game video generation, allowing the production of unlimited-length interactive game videos. Experimental results demonstrate that GameFactory effectively generates open-domain, diverse, and action-controllable game videos, representing a significant step forward in AI-driven game generation. Our dataset and project page are publicly available at https://vvictoryuki.github.io/gamefactory/. + +
+
+
+
+
+ + ☆ Diffusion Adversarial Post-Training for One-Step Video Generation + + +
+ Diffusion models are widely used for image and video generation, but their iterative generation process is slow and expensive. While existing distillation approaches have demonstrated the potential for one-step generation in the image domain, they still suffer from significant quality degradation. In this work, we propose Adversarial Post-Training (APT) against real data following diffusion pre-training for one-step video generation. To improve the training stability and quality, we introduce several improvements to the model architecture and training procedures, along with an approximated R1 regularization objective. Empirically, our experiments show that our adversarial post-trained model, Seaweed-APT, can generate 2-second, 1280x720, 24fps videos in real time using a single forward evaluation step. Additionally, our model is capable of generating 1024px images in a single step, achieving quality comparable to state-of-the-art methods. + +
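For readers unfamiliar with the R1 term mentioned above, the snippet below shows the standard (exact) R1 gradient penalty on real samples for a generic discriminator in PyTorch; the paper uses an approximated variant, and the toy discriminator and weight gamma here are assumptions for illustration only.

```python
import torch

def r1_penalty(discriminator, real, gamma=10.0):
    """Standard R1 regularizer: gamma/2 * E[ ||grad_x D(x)||^2 ] on real samples."""
    real = real.detach().requires_grad_(True)
    scores = discriminator(real).sum()
    (grad,) = torch.autograd.grad(scores, real, create_graph=True)
    return 0.5 * gamma * grad.pow(2).flatten(1).sum(dim=1).mean()

# Toy discriminator on 64x64 RGB frames (placeholder for a video discriminator).
disc = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 64 * 64, 1))
real_batch = torch.randn(4, 3, 64, 64)
penalty = r1_penalty(disc, real_batch)
penalty.backward()               # gradients flow into the discriminator weights
print(float(penalty))
```

R1 penalizes the discriminator's gradient norm at real data, which is a common way to stabilize adversarial training when the generator is a single-step student.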
+
+
+
+
+ + ☆ MiniMax-01: Scaling Foundation Models with Lightning Attention + + +
+ We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, +which are comparable to top-tier models while offering superior capabilities in +processing longer contexts. The core lies in lightning attention and its +efficient scaling. To maximize computational capacity, we integrate it with +Mixture of Experts (MoE), creating a model with 32 experts and 456 billion +total parameters, of which 45.9 billion are activated for each token. We +develop an optimized parallel strategy and highly efficient +computation-communication overlap techniques for MoE and lightning attention. +This approach enables us to conduct efficient training and inference on models +with hundreds of billions of parameters across contexts spanning millions of +tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens +during training and extrapolate to 4 million tokens during inference at an +affordable cost. Our vision-language model, MiniMax-VL-01 is built through +continued training with 512 billion vision-language tokens. Experiments on both +standard and in-house benchmarks show that our models match the performance of +state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 +times longer context window. We publicly release MiniMax-01 at +https://github.com/MiniMax-AI. + +
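To make the Mixture-of-Experts side concrete, here is a generic top-k expert-routing feed-forward layer in PyTorch. It is a textbook sketch only: the layer sizes, softmax router, and k=2 are assumptions, and it does not reflect MiniMax-01's lightning attention, parallelization strategy, or actual expert configuration.

```python
import torch
import torch.nn.functional as F

class TopKMoE(torch.nn.Module):
    """Generic top-k mixture-of-experts feed-forward layer (illustrative only)."""

    def __init__(self, d_model=512, d_ff=2048, n_experts=8, k=2):
        super().__init__()
        self.router = torch.nn.Linear(d_model, n_experts)
        self.experts = torch.nn.ModuleList(
            torch.nn.Sequential(
                torch.nn.Linear(d_model, d_ff), torch.nn.GELU(), torch.nn.Linear(d_ff, d_model)
            )
            for _ in range(n_experts)
        )
        self.k = k

    def forward(self, x):                                    # x: (tokens, d_model)
        gate = F.softmax(self.router(x), dim=-1)
        weights, idx = gate.topk(self.k, dim=-1)             # route each token to k experts
        weights = weights / weights.sum(dim=-1, keepdim=True)
        out = torch.zeros_like(x)
        for slot in range(self.k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, slot] == e                     # tokens whose slot-th choice is expert e
                if mask.any():
                    out[mask] += weights[mask, slot, None] * expert(x[mask])
        return out

tokens = torch.randn(16, 512)
print(TopKMoE()(tokens).shape)        # -> torch.Size([16, 512])
```

Only the selected experts run for each token, which is why total parameters (456B in MiniMax-01) can far exceed the activated parameters per token (45.9B).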
+
+ comment: A technical report from MiniMax. The authors are listed in + alphabetical order. We open-sourced our MiniMax-01 at + https://github.com/MiniMax-AI +
+
+
+
+
+ + ☆ Advancing Semantic Future Prediction through Multimodal Visual Sequence + Transformers + + +
+ Semantic future prediction is important for autonomous systems navigating +dynamic environments. This paper introduces FUTURIST, a method for multimodal +future semantic prediction that uses a unified and efficient visual sequence +transformer architecture. Our approach incorporates a multimodal masked visual +modeling objective and a novel masking mechanism designed for multimodal +training. This allows the model to effectively integrate visible information +from various modalities, improving prediction accuracy. Additionally, we +propose a VAE-free hierarchical tokenization process, which reduces +computational complexity, streamlines the training pipeline, and enables +end-to-end training with high-resolution, multimodal inputs. We validate +FUTURIST on the Cityscapes dataset, demonstrating state-of-the-art performance +in future semantic segmentation for both short- and mid-term forecasting. We +provide the implementation code at https://github.com/Sta8is/FUTURIST . + +
+
+
+
+
+ + ☆ LayerAnimate: Layer-specific Control for Animation + + +
+ Animated video separates foreground and background elements into layers, with +distinct processes for sketching, refining, coloring, and in-betweening. +Existing video generation methods typically treat animation as a monolithic +data domain, lacking fine-grained control over individual layers. In this +paper, we introduce LayerAnimate, a novel architectural approach that enhances +fine-grained control over individual animation layers within a video diffusion +model, allowing users to independently manipulate foreground and background +elements in distinct layers. To address the challenge of limited layer-specific +data, we propose a data curation pipeline that features automated element +segmentation, motion-state hierarchical merging, and motion coherence +refinement. Through quantitative and qualitative comparisons, and user study, +we demonstrate that LayerAnimate outperforms current methods in terms of +animation quality, control precision, and usability, making it an ideal tool +for both professional animators and amateur enthusiasts. This framework opens +up new possibilities for layer-specific animation applications and creative +flexibility. Our code is available at https://layeranimate.github.io. + +
+
+ comment: Project page: https://layeranimate.github.io +
+
+
+
+
+ + ☆ VINGS-Mono: Visual-Inertial Gaussian Splatting Monocular SLAM in Large + Scenes + + +
+ VINGS-Mono is a monocular (inertial) Gaussian Splatting (GS) SLAM framework +designed for large scenes. The framework comprises four main components: VIO +Front End, 2D Gaussian Map, NVS Loop Closure, and Dynamic Eraser. In the VIO +Front End, RGB frames are processed through dense bundle adjustment and +uncertainty estimation to extract scene geometry and poses. Based on this +output, the mapping module incrementally constructs and maintains a 2D Gaussian +map. Key components of the 2D Gaussian Map include a Sample-based Rasterizer, +Score Manager, and Pose Refinement, which collectively improve mapping speed +and localization accuracy. This enables the SLAM system to handle large-scale +urban environments with up to 50 million Gaussian ellipsoids. To ensure global +consistency in large-scale scenes, we design a Loop Closure module, which +innovatively leverages the Novel View Synthesis (NVS) capabilities of Gaussian +Splatting for loop closure detection and correction of the Gaussian map. +Additionally, we propose a Dynamic Eraser to address the inevitable presence of +dynamic objects in real-world outdoor scenes. Extensive evaluations in indoor +and outdoor environments demonstrate that our approach achieves localization +performance on par with Visual-Inertial Odometry while surpassing recent +GS/NeRF SLAM methods. It also significantly outperforms all existing methods in +terms of mapping and rendering quality. Furthermore, we developed a mobile app +and verified that our framework can generate high-quality Gaussian maps in real +time using only a smartphone camera and a low-frequency IMU sensor. To the best +of our knowledge, VINGS-Mono is the first monocular Gaussian SLAM method +capable of operating in outdoor environments and supporting kilometer-scale +large scenes. + +
+
+
+
+
+ + ☆ Can Bayesian Neural Networks Explicitly Model Input Uncertainty? + + +
+ Inputs to machine learning models can have associated noise or uncertainties, but they are often ignored and not modelled. It is unknown if Bayesian Neural Networks and their approximations are able to consider uncertainty in their inputs. In this paper we build a two-input Bayesian Neural Network (taking the input's mean and standard deviation) and evaluate its capabilities for input uncertainty estimation across different methods like Ensembles, MC-Dropout, and Flipout. Our results indicate that only some uncertainty estimation methods for approximate Bayesian NNs can model input uncertainty, in particular Ensembles and Flipout. + +
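One simple way to expose input uncertainty to an approximate-Bayesian model, shown below, is to Monte Carlo sample plausible inputs from the given mean and standard deviation and aggregate predictions over an ensemble. The helper name, sample count, and toy regressors are assumptions for illustration, not the paper's two-input network design or its evaluation protocol.

```python
import torch

def predict_with_input_uncertainty(models, x_mean, x_std, n_samples=50):
    """Propagate Gaussian input uncertainty through an ensemble by Monte Carlo sampling."""
    preds = []
    with torch.no_grad():
        for _ in range(n_samples):
            x = x_mean + x_std * torch.randn_like(x_mean)    # sample a plausible input
            preds.append(torch.stack([m(x) for m in models]))
    preds = torch.stack(preds)                               # (n_samples, n_models, batch, out)
    return preds.mean(dim=(0, 1)), preds.var(dim=(0, 1))     # predictive mean and variance

# Toy ensemble of small regressors on a 10-dimensional input.
ensemble = [torch.nn.Sequential(torch.nn.Linear(10, 32), torch.nn.ReLU(), torch.nn.Linear(32, 1))
            for _ in range(5)]
mean, var = predict_with_input_uncertainty(ensemble, torch.zeros(8, 10), 0.2 * torch.ones(8, 10))
print(mean.shape, var.shape)
```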
+
+ comment: 12 pages, 11 figures, VISAPP 2025 camera ready +
+
+
+
+
+ + ☆ LLaVA-ST: A Multimodal Large Language Model for Fine-Grained + Spatial-Temporal Understanding + + +
+ Recent advancements in multimodal large language models (MLLMs) have shown promising results, yet existing approaches struggle to effectively handle both temporal and spatial localization simultaneously. This challenge stems from two key issues: first, incorporating spatial-temporal localization introduces a vast number of coordinate combinations, complicating the alignment of linguistic and visual coordinate representations; second, encoding fine-grained temporal and spatial information during video feature compression is inherently difficult. To address these issues, we propose LLaVA-ST, an MLLM for fine-grained spatial-temporal multimodal understanding. In LLaVA-ST, we propose Language-Aligned Positional Embedding, which embeds the textual coordinate special token into the visual space, simplifying the alignment of fine-grained spatial-temporal correspondences. Additionally, we design the Spatial-Temporal Packer, which decouples the feature compression of temporal and spatial resolutions into two distinct point-to-region attention processing streams. Furthermore, we propose the ST-Align dataset with 4.3M training samples for fine-grained spatial-temporal multimodal understanding. With ST-Align, we present a progressive training pipeline that aligns the visual and textual features through sequential coarse-to-fine stages. Additionally, we introduce an ST-Align benchmark to evaluate spatial-temporal interleaved fine-grained understanding tasks, which include Spatial-Temporal Video Grounding (STVG), Event Localization and Captioning (ELC), and Spatial Video Grounding (SVG). LLaVA-ST achieves outstanding performance on 11 benchmarks requiring fine-grained temporal, spatial, or spatial-temporal interleaving multimodal understanding. Our code, data and benchmark will be released at https://github.com/appletea233/LLaVA-ST . + +
+
+
+
+
+ + ☆ SmartEraser: Remove Anything from Images using Masked-Region Guidance + + +
+ Object removal has so far been dominated by the mask-and-inpaint paradigm, +where the masked region is excluded from the input, leaving models relying on +unmasked areas to inpaint the missing region. However, this approach lacks +contextual information for the masked area, often resulting in unstable +performance. In this work, we introduce SmartEraser, built with a new removing +paradigm called Masked-Region Guidance. This paradigm retains the masked region +in the input, using it as guidance for the removal process. It offers several +distinct advantages: (a) it guides the model to accurately identify the object +to be removed, preventing its regeneration in the output; (b) since the user +mask often extends beyond the object itself, it aids in preserving the +surrounding context in the final result. Leveraging this new paradigm, we +present Syn4Removal, a large-scale object removal dataset, where instance +segmentation data is used to copy and paste objects onto images as removal +targets, with the original images serving as ground truths. Experimental +results demonstrate that SmartEraser significantly outperforms existing +methods, achieving superior performance in object removal, especially in +complex scenes with intricate compositions. + +
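The difference between the two input conventions can be shown in a few lines: the conventional mask-and-inpaint input blanks out the masked pixels, while a Masked-Region-Guidance-style input keeps them and appends the mask as an extra channel. Function names and shapes are illustrative assumptions rather than SmartEraser's actual preprocessing.

```python
import numpy as np

def mask_and_inpaint_input(image, mask):
    """Conventional removal input: pixels under the mask are blanked out."""
    return np.concatenate([image * (1 - mask), mask], axis=-1)

def masked_region_guidance_input(image, mask):
    """Masked-Region-Guidance style input: the masked region is kept as guidance."""
    return np.concatenate([image, mask], axis=-1)

image = np.random.rand(256, 256, 3).astype(np.float32)
mask = np.zeros((256, 256, 1), dtype=np.float32)
mask[80:160, 80:160] = 1.0                              # user-drawn region around the object to remove
print(mask_and_inpaint_input(image, mask).shape,        # (256, 256, 4), object pixels erased
      masked_region_guidance_input(image, mask).shape)  # (256, 256, 4), object pixels preserved
```

Keeping the object visible is what lets the model identify exactly what to remove and preserve the surrounding context, as argued in the abstract.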
+
+ comment: Project at: https://longtaojiang.github.io/smarteraser.github.io/ +
+
+
+
+
+ + ☆ AI Driven Water Segmentation with deep learning models for Enhanced + Flood Monitoring + + +
+ Flooding is a major natural hazard causing significant fatalities and economic losses annually, with increasing frequency due to climate change. Rapid and accurate flood detection and monitoring are crucial for mitigating these impacts. This study compares the performance of three deep learning models, UNet, ResNet, and DeepLabv3, for pixelwise water segmentation to aid in flood detection, utilizing images from drones, field observations, and social media. This study involves creating a new dataset that augments well-known benchmark datasets with flood-specific images, enhancing the robustness of the models. The UNet, ResNet, and DeepLabv3 architectures are tested to determine their effectiveness in various environmental conditions and geographical locations, and the strengths and limitations of each model are also discussed here, providing insights into their applicability in different scenarios by predicting image segmentation masks. This fully automated approach allows these models to isolate flooded areas in images, significantly reducing processing time compared to traditional semi-automated methods. The outcome of this study is the segmentation masks predicted for each flood-affected image and the validation accuracy of these models. This methodology facilitates timely and continuous flood monitoring, providing vital data for emergency response teams to reduce loss of life and economic damages. It offers a significant reduction in the time required to generate flood maps, cutting down the manual processing time. Additionally, we present avenues for future research, including the integration of multimodal data sources and the development of robust deep learning architectures tailored specifically for flood detection tasks. Overall, our work contributes to the advancement of flood management strategies through innovative use of deep learning technologies. + +
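A minimal setup for one of the compared architectures, binary water segmentation with torchvision's DeepLabv3, might look like the following; the tensors stand in for real drone or social-media imagery and annotations, and the single-batch IoU line is a simplistic estimate rather than the study's evaluation protocol.

```python
import torch
from torchvision.models.segmentation import deeplabv3_resnet50

# Binary water / non-water segmentation head (class 0 = background, class 1 = water).
model = deeplabv3_resnet50(weights=None, num_classes=2)
criterion = torch.nn.CrossEntropyLoss()

images = torch.randn(2, 3, 256, 256)            # stand-in for drone / social-media frames
masks = torch.randint(0, 2, (2, 256, 256))      # stand-in for pixelwise water annotations

logits = model(images)["out"]                   # (B, 2, H, W)
loss = criterion(logits, masks)
loss.backward()

pred_water = logits.argmax(1) == 1
iou = (pred_water & (masks == 1)).sum() / ((pred_water | (masks == 1)).sum() + 1e-8)
print(float(loss), float(iou))                  # training loss and water-class IoU
```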
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Towards an End-to-End (E2E) Adversarial Learning and Application in the + Physical World + + +
+ The traditional learning process of patch-based adversarial attacks, conducted in the digital domain and then applied in the physical domain (e.g., via printed stickers), may suffer from reduced performance due to adversarial patches' limited transferability from the digital domain to the physical domain. Given that previous studies have considered using projectors to apply adversarial attacks, we raise the following question: can adversarial learning (i.e., patch generation) be performed entirely in the physical domain with a projector? In this work, we propose the Physical-domain Adversarial Patch Learning Augmentation (PAPLA) framework, a novel end-to-end (E2E) framework that converts adversarial learning from the digital domain to the physical domain using a projector. We evaluate PAPLA across multiple scenarios, including controlled laboratory settings and realistic outdoor environments, demonstrating its ability to ensure attack success compared to conventional digital learning-physical application (DL-PA) methods. We also analyze the impact of environmental factors, such as projection surface color, projector strength, ambient light, distance, and angle of the target object relative to the camera, on the effectiveness of projected patches. We further demonstrate the feasibility of the attack against a parked car and a stop sign in a real-world outdoor environment. Our results show that under specific conditions, E2E adversarial learning in the physical domain eliminates the transferability issue and ensures evasion by object detectors. Finally, we provide insights into the challenges and opportunities of applying adversarial learning in the physical domain and explain where such an approach is more effective than using a sticker. + +
+
+
+
+
+ + ☆ Continual Deep Active Learning for Medical Imaging: Replay-Base + Architecture for Context Adaptation + + +
+ Deep Learning for medical imaging faces challenges in adapting and +generalizing to new contexts. Additionally, it often lacks sufficient labeled +data for specific tasks requiring significant annotation effort. Continual +Learning (CL) tackles adaptability and generalizability by enabling lifelong +learning from a data stream while mitigating forgetting of previously learned +knowledge. Active Learning (AL) reduces the number of required annotations for +effective training. This work explores both approaches (CAL) to develop a novel +framework for robust medical image analysis. Based on the automatic recognition +of shifts in image characteristics, Replay-Base Architecture for Context +Adaptation (RBACA) employs a CL rehearsal method to continually learn from +diverse contexts, and an AL component to select the most informative instances +for annotation. A novel approach to evaluate CAL methods is established using a +defined metric denominated IL-Score, which allows for the simultaneous +assessment of transfer learning, forgetting, and final model performance. We +show that RBACA works in domain and class-incremental learning scenarios, by +assessing its IL-Score on the segmentation and diagnosis of cardiac images. The +results show that RBACA outperforms a baseline framework without CAL, and a +state-of-the-art CAL method across various memory sizes and annotation budgets. +Our code is available in https://github.com/RuiDaniel/RBACA . + +
+
+
+
+
+ + ☆ A Feature-Level Ensemble Model for COVID-19 Identification in CXR Images + using Choquet Integral and Differential Evolution Optimization + + +
+ The COVID-19 pandemic has profoundly impacted billions globally. It challenges public health and healthcare systems due to its rapid spread and severe respiratory effects. An effective strategy to mitigate the COVID-19 pandemic involves integrating testing to identify infected individuals. While RT-PCR is considered the gold standard for diagnosing COVID-19, it has some limitations such as the risk of false negatives. To address this problem, this paper introduces a novel Deep Learning Diagnosis System that integrates pre-trained Deep Convolutional Neural Networks (DCNNs) within an ensemble learning framework to achieve precise identification of COVID-19 cases from Chest X-ray (CXR) images. We combine feature vectors from the final hidden layers of pre-trained DCNNs using the Choquet integral to capture interactions between different DCNNs that a linear approach cannot. We employed Sugeno-$\lambda$ measure theory to derive fuzzy measures for subsets of networks to enable aggregation. We utilized Differential Evolution to estimate fuzzy densities. We developed a TensorFlow-based layer for the Choquet operation to facilitate efficient aggregation, due to the intricacies involved in aggregating feature vectors. Experimental results on the COVIDx dataset show that our ensemble model achieved 98\% accuracy in three-class classification and 99.50\% in binary classification, outperforming its components (DenseNet-201: 97\% for three-class, 98.75\% for binary; Inception-v3: 96.25\% for three-class, 98.50\% for binary; Xception: 94.50\% for three-class, 98\% for binary) and surpassing many previous methods. + +
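For readers unfamiliar with the aggregation step, the sketch below computes a Choquet integral with respect to a Sugeno-$\lambda$ fuzzy measure over per-model scores. The densities are placeholders (the paper estimates them with Differential Evolution), and this NumPy version aggregates scalar scores rather than the paper's TensorFlow feature-vector layer.

```python
import numpy as np
from scipy.optimize import brentq

def sugeno_lambda(densities):
    """Solve prod(1 + lam * g_i) = 1 + lam for the Sugeno lambda-measure parameter."""
    g = np.asarray(densities, dtype=float)
    f = lambda lam: np.prod(1.0 + lam * g) - 1.0 - lam
    s = g.sum()
    if np.isclose(s, 1.0):
        return 0.0                                  # additive measure
    if s > 1.0:
        return brentq(f, -1.0 + 1e-10, -1e-10)      # redundant sources -> lambda in (-1, 0)
    return brentq(f, 1e-10, 1e10)                   # complementary sources -> lambda > 0

def choquet_integral(x, densities, lam=None):
    """Aggregate per-model scores x with a Choquet integral w.r.t. a Sugeno lambda-measure."""
    x = np.asarray(x, dtype=float)
    g = np.asarray(densities, dtype=float)
    if lam is None:
        lam = sugeno_lambda(g)
    order = np.argsort(-x)                          # sort scores in decreasing order
    measure_prev, result, subset = 0.0, 0.0, []
    for idx in order:
        subset.append(idx)
        if lam == 0.0:
            measure = g[subset].sum()
        else:
            measure = (np.prod(1.0 + lam * g[subset]) - 1.0) / lam
        result += x[idx] * (measure - measure_prev)
        measure_prev = measure
    return result

scores = np.array([0.92, 0.85, 0.60])      # e.g. per-class confidences from three DCNNs
densities = np.array([0.45, 0.35, 0.30])   # placeholder fuzzy densities
print(choquet_integral(scores, densities))
```

Unlike a weighted average, the fuzzy measure assigns weight to every subset of networks, which is how interactions between the DCNNs are captured.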
+
+
+
+
+ + ☆ Efficient Deep Learning-based Forward Solvers for Brain Tumor Growth + Models + + +
+ Glioblastoma, a highly aggressive brain tumor, poses major challenges due to +its poor prognosis and high morbidity rates. Partial differential +equation-based models offer promising potential to enhance therapeutic outcomes +by simulating patient-specific tumor behavior for improved radiotherapy +planning. However, model calibration remains a bottleneck due to the high +computational demands of optimization methods like Monte Carlo sampling and +evolutionary algorithms. To address this, we recently introduced an approach +leveraging a neural forward solver with gradient-based optimization to +significantly reduce calibration time. This approach requires a highly accurate +and fully differentiable forward model. We investigate multiple architectures, +including (i) an enhanced TumorSurrogate, (ii) a modified nnU-Net, and (iii) a +3D Vision Transformer (ViT). The optimized TumorSurrogate achieved the best +overall results, excelling in both tumor outline matching and voxel-level +prediction of tumor cell concentration. It halved the MSE relative to the +baseline model and achieved the highest Dice score across all tumor cell +concentration thresholds. Our study demonstrates significant enhancement in +forward solver performance and outlines important future research directions. + +
+
+
+
+
+ + ☆ FramePainter: Endowing Interactive Image Editing with Video Diffusion + Priors + + +
+ Interactive image editing allows users to modify images through visual interaction operations such as drawing, clicking, and dragging. Existing methods construct such supervision signals from videos, as they capture how objects change with various physical interactions. However, these models are usually built upon text-to-image diffusion models, and thus necessitate (i) massive training samples and (ii) an additional reference encoder to learn real-world dynamics and visual consistency. In this paper, we reformulate this task as an image-to-video generation problem, so that it inherits powerful video diffusion priors to reduce training costs and ensure temporal consistency. Specifically, we introduce FramePainter as an efficient instantiation of this formulation. Initialized with Stable Video Diffusion, it only uses a lightweight sparse control encoder to inject editing signals. Considering the limitations of temporal attention in handling large motion between two frames, we further propose matching attention to enlarge the receptive field while encouraging dense correspondence between edited and source image tokens. We highlight the effectiveness and efficiency of FramePainter across various editing signals: it substantially outperforms previous state-of-the-art methods with far less training data, achieving highly seamless and coherent editing of images, e.g., automatically adjusting the reflection of a cup. Moreover, FramePainter also exhibits exceptional generalization in scenarios not present in real-world videos, e.g., transforming a clownfish into a shark-like shape. Our code will be available at https://github.com/YBYBZhang/FramePainter. + +
+
+ comment: Code: https://github.com/YBYBZhang/FramePainter +
+
+
+
+
+ + ☆ EmoNeXt: an Adapted ConvNeXt for Facial Emotion Recognition + + +
+ Facial expressions play a crucial role in human communication, serving as a powerful and impactful means to express a wide range of emotions. With advancements in artificial intelligence and computer vision, deep neural networks have emerged as effective tools for facial emotion recognition. In this paper, we propose EmoNeXt, a novel deep learning framework for facial expression recognition based on an adapted ConvNeXt architecture. We integrate a Spatial Transformer Network (STN) to focus on feature-rich regions of the face and Squeeze-and-Excitation blocks to capture channel-wise dependencies. Moreover, we introduce a self-attention regularization term, encouraging the model to generate compact feature vectors. We demonstrate the superiority of our model over existing state-of-the-art deep learning models on the FER2013 dataset regarding emotion classification accuracy. + +
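The Squeeze-and-Excitation block mentioned above is standard and compact enough to sketch; the channel count and reduction ratio below are assumptions, not EmoNeXt's exact configuration.

```python
import torch

class SEBlock(torch.nn.Module):
    """Squeeze-and-Excitation: re-weight channels by globally pooled statistics."""

    def __init__(self, channels, reduction=16):
        super().__init__()
        self.pool = torch.nn.AdaptiveAvgPool2d(1)                # squeeze: global context per channel
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(channels, channels // reduction), torch.nn.ReLU(inplace=True),
            torch.nn.Linear(channels // reduction, channels), torch.nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, _, _ = x.shape
        w = self.fc(self.pool(x).view(b, c)).view(b, c, 1, 1)    # excitation: channel weights in (0, 1)
        return x * w

feat = torch.randn(4, 96, 28, 28)          # e.g. a ConvNeXt-stage feature map
print(SEBlock(96)(feat).shape)             # -> torch.Size([4, 96, 28, 28])
```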
+
+ comment: 6 pages, 5 figures and 2 tables. 2023 IEEE 25th International + Workshop on Multimedia Signal Processing (MMSP), Poitiers, France +
+
+
+
+
+ + ☆ Self-supervised Deep Hyperspectral Inpainting with the Plug and Play and + Deep Image Prior Models + + +
+ Hyperspectral images are typically composed of hundreds of narrow and contiguous spectral bands, each containing information regarding the material composition of the imaged scene. However, these images can be affected by various sources of noise, distortions, or data loss, which can significantly degrade their quality and usefulness. This paper introduces an algorithm with guaranteed convergence, LRS-PnP-DIP(1-Lip), which successfully addresses the previously reported instability issue of DHP. The proposed algorithm extends the successful joint low-rank and sparse model to further exploit the underlying data structures beyond the conventional and sometimes restrictive unions of subspace models. A stability analysis guarantees the convergence of the proposed algorithm under mild assumptions, which is crucial for its application in real-world scenarios. Extensive experiments demonstrate that the proposed solution consistently delivers visually and quantitatively superior inpainting results, establishing state-of-the-art performance. + +
+
+ comment: 31 pages, 9 Figures, 7 Tables. arXiv admin note: text overlap with + arXiv:2306.08128 +
+
+
+
+
+ + ☆ A Critical Synthesis of Uncertainty Quantification and Foundation Models + in Monocular Depth Estimation + + +
+ While recent foundation models have enabled significant breakthroughs in +monocular depth estimation, a clear path towards safe and reliable deployment +in the real-world remains elusive. Metric depth estimation, which involves +predicting absolute distances, poses particular challenges, as even the most +advanced foundation models remain prone to critical errors. Since quantifying +the uncertainty has emerged as a promising endeavor to address these +limitations and enable trustworthy deployment, we fuse five different +uncertainty quantification methods with the current state-of-the-art +DepthAnythingV2 foundation model. To cover a wide range of metric depth +domains, we evaluate their performance on four diverse datasets. Our findings +identify fine-tuning with the Gaussian Negative Log-Likelihood Loss (GNLL) as a +particularly promising approach, offering reliable uncertainty estimates while +maintaining predictive performance and computational efficiency on par with the +baseline, encompassing both training and inference time. By fusing uncertainty +quantification and foundation models within the context of monocular depth +estimation, this paper lays a critical foundation for future research aimed at +improving not only model performance but also its explainability. Extending +this critical synthesis of uncertainty quantification and foundation models +into other crucial tasks, such as semantic segmentation and pose estimation, +presents exciting opportunities for safer and more reliable machine vision +systems. + +
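Fine-tuning with the Gaussian Negative Log-Likelihood simply means predicting a variance alongside each depth value and scoring errors against it, as in the sketch below; the two-convolution head and feature shapes are assumptions for illustration, not the DepthAnythingV2 integration evaluated in the paper.

```python
import torch

# Depth head that predicts a per-pixel mean and variance instead of a point estimate.
class DepthWithVariance(torch.nn.Module):
    def __init__(self, in_ch=64):
        super().__init__()
        self.mean = torch.nn.Conv2d(in_ch, 1, kernel_size=1)
        self.log_var = torch.nn.Conv2d(in_ch, 1, kernel_size=1)

    def forward(self, feats):
        return self.mean(feats), self.log_var(feats).exp()       # variance must stay positive

head = DepthWithVariance()
gnll = torch.nn.GaussianNLLLoss()                                # Gaussian negative log-likelihood

feats = torch.randn(2, 64, 60, 80)          # stand-in for backbone features
gt_depth = torch.rand(2, 1, 60, 80) * 10.0  # metric ground-truth depth in meters

pred_mean, pred_var = head(feats)
loss = gnll(pred_mean, gt_depth, pred_var)  # errors are weighted by the predicted uncertainty
loss.backward()
print(float(loss))
```

Pixels with large predicted variance are penalized less for large errors but pay a log-variance cost, which is what makes the resulting variance usable as an uncertainty estimate.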
+
+
+
+
+ + ☆ CG-MER: A Card Game-based Multimodal dataset for Emotion Recognition + + +
+ The field of affective computing has seen significant advancements in +exploring the relationship between emotions and emerging technologies. This +paper presents a novel and valuable contribution to this field with the +introduction of a comprehensive French multimodal dataset designed specifically +for emotion recognition. The dataset encompasses three primary modalities: +facial expressions, speech, and gestures, providing a holistic perspective on +emotions. Moreover, the dataset has the potential to incorporate additional +modalities, such as Natural Language Processing (NLP) to expand the scope of +emotion recognition research. The dataset was curated through engaging +participants in card game sessions, where they were prompted to express a range +of emotions while responding to diverse questions. The study included 10 +sessions with 20 participants (9 females and 11 males). The dataset serves as a +valuable resource for furthering research in emotion recognition and provides +an avenue for exploring the intricate connections between human emotions and +digital technologies. + +
+
+ comment: 8 pages, 2 figures and 4 tables. Sixteenth International Conference + on Machine Vision (ICMV 2023), Yerevan, Armenia +
+
+
+
+
+ + ☆ D$^2$-DPM: Dual Denoising for Quantized Diffusion Probabilistic Models + + +
+ Diffusion models have achieved cutting-edge performance in image generation. +However, their lengthy denoising process and computationally intensive score +estimation network impede their scalability in low-latency and +resource-constrained scenarios. Post-training quantization (PTQ) compresses and +accelerates diffusion models without retraining, but it inevitably introduces +additional quantization noise, resulting in mean and variance deviations. In +this work, we propose D2-DPM, a dual denoising mechanism aimed at precisely +mitigating the adverse effects of quantization noise on the noise estimation +network. Specifically, we first unravel the impact of quantization noise on the +sampling equation into two components: the mean deviation and the variance +deviation. The mean deviation alters the drift coefficient of the sampling +equation, influencing the trajectory trend, while the variance deviation +magnifies the diffusion coefficient, impacting the convergence of the sampling +trajectory. The proposed D2-DPM is thus devised to denoise the quantization +noise at each time step, and then denoise the noisy sample through the inverse +diffusion iterations. Experimental results demonstrate that D2-DPM achieves +superior generation quality, yielding a 1.42 lower FID than the full-precision +model while achieving 3.99x compression and 11.67x bit-operation acceleration. + +
+
+ comment: 9 pages, 4 figures, accepted by AAAI 2025 +
+
+
+
+
+ + ☆ Object-Centric 2D Gaussian Splatting: Background Removal and + Occlusion-Aware Pruning for Compact Object Models + + +
+ Current Gaussian Splatting approaches are effective for reconstructing entire +scenes but lack the option to target specific objects, making them +computationally expensive and unsuitable for object-specific applications. We +propose a novel approach that leverages object masks to enable targeted +reconstruction, resulting in object-centric models. Additionally, we introduce +an occlusion-aware pruning strategy to minimize the number of Gaussians without +compromising quality. Our method reconstructs compact object models, yielding +object-centric Gaussian and mesh representations that are up to 96\% smaller +and up to 71\% faster to train compared to the baseline while retaining +competitive quality. These representations are immediately usable for +downstream applications such as appearance editing and physics simulation +without additional processing. + +
+
+ comment: Accepted at ICPRAM 2025 (https://icpram.scitevents.org/Home.aspx) +
+
+
+
+
+ + ☆ Benchmarking Multimodal Models for Fine-Grained Image Analysis: A + Comparative Study Across Diverse Visual Features + + +
+ This article introduces a benchmark designed to evaluate the capabilities of +multimodal models in analyzing and interpreting images. The benchmark focuses +on seven key visual aspects: main object, additional objects, background, +detail, dominant colors, style, and viewpoint. A dataset of 14,580 images, +generated from diverse text prompts, was used to assess the performance of +seven leading multimodal models. These models were evaluated on their ability +to accurately identify and describe each visual aspect, providing insights into +their strengths and weaknesses for comprehensive image understanding. The +findings of this benchmark have significant implications for the development +and selection of multimodal models for various image analysis tasks. + +
+
+ comment: 6 pages, 2 tables, 2 charts +
+
+
+
+
+ + ☆ Revolutionizing Communication with Deep Learning and XAI for Enhanced + Arabic Sign Language Recognition + + +
+ This study introduces an integrated approach to recognizing Arabic Sign +Language (ArSL) using state-of-the-art deep learning models such as +MobileNetV3, ResNet50, and EfficientNet-B2. These models are further enhanced +by explainable AI (XAI) techniques to boost interpretability. The ArSL2018 and +RGB Arabic Alphabets Sign Language (AASL) datasets are employed, with +EfficientNet-B2 achieving peak accuracies of 99.48\% and 98.99\%, respectively. +Key innovations include sophisticated data augmentation methods to mitigate +class imbalance, implementation of stratified 5-fold cross-validation for +better generalization, and the use of Grad-CAM for clear model decision +transparency. The proposed system not only sets new benchmarks in recognition +accuracy but also emphasizes interpretability, making it suitable for +applications in healthcare, education, and inclusive communication +technologies. + +
+
+ comment: 13 pages, 25 figures, 16 tables +
+
+
+
+
+ + ☆ DM-Mamba: Dual-domain Multi-scale Mamba for MRI reconstruction + + +
+ The accelerated MRI reconstruction poses a challenging ill-posed inverse +problem due to the significant undersampling in k-space. Deep neural networks, +such as CNNs and ViT, have shown substantial performance improvements for this +task while encountering the dilemma between global receptive fields and +efficient computation. To this end, this paper pioneers exploring Mamba, a new +paradigm for long-range dependency modeling with linear complexity, for +efficient and effective MRI reconstruction. However, directly applying Mamba to +MRI reconstruction faces three significant issues: (1) Mamba's row-wise and +column-wise scanning disrupts k-space's unique spectrum, leaving its potential +in k-space learning unexplored. (2) Existing Mamba methods unfold feature maps +with multiple lengthy scanning paths, leading to long-range forgetting and high +computational burden. (3) Mamba struggles with spatially-varying contents, +resulting in limited diversity of local representations. To address these, we +propose a dual-domain multi-scale Mamba for MRI reconstruction from the +following perspectives: (1) We pioneer vision Mamba in k-space learning. A +circular scanning is customized for spectrum unfolding, benefiting the global +modeling of k-space. (2) We propose a multi-scale Mamba with an efficient +scanning strategy in both image and k-space domains. It mitigates long-range +forgetting and achieves a better trade-off between efficiency and performance. +(3) We develop a local diversity enhancement module to improve the +spatially-varying representation of Mamba. Extensive experiments are conducted +on three public datasets for MRI reconstruction under various undersampling +patterns. Comprehensive results demonstrate that our method significantly +outperforms state-of-the-art methods with lower computational cost. +Implementation code will be available at +https://github.com/XiaoMengLiLiLi/DM-Mamba. + +
+
+
+
+
+ + ☆ Energy Backdoor Attack to Deep Neural Networks + + +
+ The rise of deep learning (DL) has increased computing complexity and energy +use, prompting the adoption of application specific integrated circuits (ASICs) +for energy-efficient edge and mobile deployment. However, recent studies have +demonstrated the vulnerability of these accelerators to energy attacks. Despite +the development of various inference time energy attacks in prior research, +backdoor energy attacks remain unexplored. In this paper, we design an +innovative energy backdoor attack against deep neural networks (DNNs) operating +on sparsity-based accelerators. Our attack is carried out in two distinct +phases: backdoor injection and backdoor stealthiness. Experimental results +using ResNet-18 and MobileNet-V2 models trained on CIFAR-10 and Tiny ImageNet +datasets show the effectiveness of our proposed attack in increasing energy +consumption on trigger samples while preserving the model's performance for +clean/regular inputs. This demonstrates the vulnerability of DNNs to energy +backdoor attacks. The source code of our attack is available at: +https://github.com/hbrachemi/energy_backdoor. + +
+
+
+
+
+ + ☆ Bootstrapping Corner Cases: High-Resolution Inpainting for Safety + Critical Detect and Avoid for Automated Flying + + +
+ Modern machine learning techniques have shown tremendous potential, especially for object detection on camera images. For this reason, they are also used to enable safety-critical automated processes such as autonomous drone flights. We present a study on object detection for Detect and Avoid, a safety-critical function for drones that detects air traffic during automated flights for safety reasons. An ill-posed problem is the generation of good and, especially, large datasets, since the detections themselves are the corner cases. Most models suffer from limited ground truth in raw data, e.g., recorded air traffic or frontal flights with a small aircraft, which often leads to poor and critical detection rates. We overcome this problem by using inpainting methods to bootstrap the dataset such that it explicitly contains the corner cases of the raw data. We provide an overview of inpainting methods and generative models and present an example pipeline given a small annotated dataset. We validate our method by generating a high-resolution dataset, which we make publicly available, and present it to an independent object detector that was fully trained on real data. + +
+
+
+
+
+ + ☆ Audio-visual Deepfake Detection With Local Temporal Inconsistencies + + +
+ This paper proposes an audio-visual deepfake detection approach that aims to +capture fine-grained temporal inconsistencies between audio and visual +modalities. To achieve this, both architectural and data synthesis strategies +are introduced. From an architectural perspective, a temporal distance map, +coupled with an attention mechanism, is designed to capture these +inconsistencies while minimizing the impact of irrelevant temporal +subsequences. Moreover, we explore novel pseudo-fake generation techniques to +synthesize local inconsistencies. Our approach is evaluated against +state-of-the-art methods using the DFDC and FakeAVCeleb datasets, demonstrating +its effectiveness in detecting audio-visual deepfakes. + +
+
+ comment: Accepted in ICASSP 2025 +
+
+
+
+
+ + ☆ SAR Strikes Back: A New Hope for RSVQA + + +
+ Remote sensing visual question answering (RSVQA) is a task that automatically +extracts information from satellite images and processes a question to predict +the answer from the images in textual form, helping with the interpretation of +the image. While different methods have been proposed to extract information +from optical images with different spectral bands and resolutions, no method +has been proposed to answer questions from Synthetic Aperture Radar (SAR) +images. SAR images capture electromagnetic information from the scene, and are +less affected by atmospheric conditions, such as clouds. In this work, our +objective is to introduce SAR in the RSVQA task, finding the best way to use +this modality. In our research, we carry out a study on different pipelines for +the task of RSVQA taking into account information from both SAR and optical +data. To this purpose, we also present a dataset that allows for the +introduction of SAR images in the RSVQA framework. We propose two different +models to include the SAR modality. The first one is an end-to-end method in +which we add an additional encoder for the SAR modality. In the second +approach, we build on a two-stage framework. First, relevant information is +extracted from SAR and, optionally, optical data. This information is then +translated into natural language to be used in the second step which only +relies on a language model to provide the answer. We find that the second +pipeline allows us to obtain good results with SAR images alone. We then try +various types of fusion methods to use SAR and optical images together, finding +that a fusion at the decision level achieves the best results on the proposed +dataset. We show that SAR data offers additional information when fused with +the optical modality, particularly for questions related to specific land cover +classes, such as water areas. + +
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ☆ Revisiting Birds Eye View Perception Models with Frozen Foundation + Models: DINOv2 and Metric3Dv2 + + +
+ Birds Eye View perception models require extensive data to perform and +generalize effectively. While traditional datasets often provide abundant +driving scenes from diverse locations, this is not always the case. It is +crucial to maximize the utility of the available training data. With the advent +of large foundation models such as DINOv2 and Metric3Dv2, a pertinent question +arises: can these models be integrated into existing model architectures to not +only reduce the required training data but surpass the performance of current +models? We choose two model architectures in the vehicle segmentation domain to +alter: Lift-Splat-Shoot, and Simple-BEV. For Lift-Splat-Shoot, we explore the +implementation of frozen DINOv2 for feature extraction and Metric3Dv2 for depth +estimation, where we greatly exceed the baseline results by 7.4 IoU while +utilizing only half the training data and iterations. Furthermore, we introduce +an innovative application of Metric3Dv2's depth information as a PseudoLiDAR +point cloud incorporated into the Simple-BEV architecture, replacing +traditional LiDAR. This integration results in a +3 IoU improvement compared to +the Camera-only model. + +
+
+ comment: Accepted for publication at the Electronic Imaging - Autonomous Vehicles and Machines Conference 2025 +
+
+
+
+
+ + ☆ RoHan: Robust Hand Detection in Operation Room + + +
+ Hand-specific localization has garnered significant interest within the computer vision community. Although there are numerous datasets with hand annotations from various angles and settings, domain transfer techniques frequently struggle in surgical environments. This is mainly due to the limited availability of gloved hand instances and the unique challenges of operating rooms (ORs). Thus, hand-detection models tailored to OR settings require extensive training and expensive annotation processes. To overcome these challenges, we present "RoHan" - a novel approach for robust hand detection in the OR, leveraging advanced semi-supervised domain adaptation techniques to tackle the challenges of varying recording conditions, diverse glove colors, and occlusions common in surgical settings. Our methodology encompasses two main stages: (1) a data augmentation strategy that utilizes "Artificial Gloves," a method for augmenting publicly available hand datasets with synthetic images of hands wearing gloves; (2) a semi-supervised domain adaptation pipeline that improves detection performance in real-world OR settings through iterative prediction refinement and efficient frame filtering. We evaluate our method using two datasets: simulated enterotomy repair and saphenous vein graft harvesting. "RoHan" substantially reduces the need for extensive labeling and model training, paving the way for the practical implementation of hand detection technologies in medical settings. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Change Captioning in Remote Sensing: Evolution to SAT-Cap -- A + Single-Stage Transformer Approach + + +
+ Change captioning has become essential for accurately describing changes in multi-temporal remote sensing data, providing an intuitive way to monitor Earth's dynamics through natural language. However, existing change captioning methods face two key challenges: high computational demands due to a multi-stage fusion strategy, and insufficient detail in object descriptions due to limited semantic extraction from individual images. To solve these challenges, we propose SAT-Cap, based on the transformer model with single-stage feature fusion for remote sensing change captioning. In particular, SAT-Cap integrates a Spatial-Channel Attention Encoder, a Difference-Guided Fusion module, and a Caption Decoder. Compared to typical models that require multi-stage fusion in the transformer encoder and fusion module, SAT-Cap uses only a simple cosine similarity-based fusion module for information integration, reducing the complexity of the model architecture. By jointly modeling spatial and channel information in the Spatial-Channel Attention Encoder, our approach significantly enhances the model's ability to extract semantic information from objects in multi-temporal remote sensing images. Extensive experiments validate the effectiveness of SAT-Cap, achieving CIDEr scores of 140.23% on the LEVIR-CC dataset and 97.74% on the DUBAI-CC dataset, surpassing current state-of-the-art methods. The code and pre-trained models will be available online.
+
+
+
+
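+ As a rough illustration of cosine-similarity-based fusion of bi-temporal features (the general idea named in the abstract above, not SAT-Cap's exact module), one can weight the difference of the two feature maps by how dissimilar they are at each spatial location. Everything below, including shapes and the concatenation scheme, is an assumption for illustration.
```python
import torch
import torch.nn.functional as F

def cosine_fusion(feat_t1: torch.Tensor, feat_t2: torch.Tensor) -> torch.Tensor:
    """Fuse two (B, C, H, W) feature maps from images of the same scene taken
    at different times. Locations that changed (low cosine similarity) get a
    higher weight on the difference signal."""
    sim = F.cosine_similarity(feat_t1, feat_t2, dim=1, eps=1e-6)  # (B, H, W)
    change_weight = (1.0 - sim).unsqueeze(1)                      # (B, 1, H, W)
    return torch.cat([feat_t1 + feat_t2, change_weight * (feat_t2 - feat_t1)], dim=1)

f1, f2 = torch.randn(2, 256, 16, 16), torch.randn(2, 256, 16, 16)
print(cosine_fusion(f1, f2).shape)  # torch.Size([2, 512, 16, 16])
```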
+ + ☆ EarthView: A Large Scale Remote Sensing Dataset for Self-Supervision + + +
+ This paper presents EarthView, a comprehensive dataset specifically designed +for self-supervision on remote sensing data, intended to enhance deep learning +applications on Earth monitoring tasks. The dataset spans 15 tera pixels of +global remote-sensing data, combining imagery from a diverse range of sources, +including NEON, Sentinel, and a novel release of 1m spatial resolution data +from Satellogic. Our dataset provides a wide spectrum of image data with +varying resolutions, harnessed from different sensors and organized coherently +into an accessible HuggingFace dataset in parquet format. This data spans five +years, from 2017 to 2022. Accompanying the dataset, we introduce EarthMAE, a +tailored Masked Autoencoder, developed to tackle the distinct challenges of +remote sensing data. Trained in a self-supervised fashion, EarthMAE effectively +processes different data modalities such as hyperspectral, multispectral, +topographical data, segmentation maps, and temporal structure. This model helps +us show that pre-training on Satellogic data improves performance on downstream +tasks. While there is still a gap to fill in MAE for heterogeneous data, we +regard this innovative combination of an expansive, diverse dataset and a +versatile model adapted for self-supervised learning as a stride forward in +deep learning for Earth monitoring. + +
+
+ comment: 2nd Workshop on Computer Vision for Earth Observation (CV4EO) + Applications +
+
+
+
+
+ + ☆ Guiding the classification of hepatocellular carcinoma on 3D CT-scans + using deep and handcrafted radiological features + + +
+ Hepatocellular carcinoma is the most widespread primary liver cancer across the world ($\sim$80\% of the liver tumors). The gold standard for HCC diagnosis is liver biopsy. However, in the clinical routine, expert radiologists provide a visual diagnosis by interpreting hepatic CT-scans according to a standardized protocol, the LI-RADS, which uses five radiological criteria with an associated decision tree. In this paper, we propose an automatic approach to predict histology-proven HCC from CT images in order to reduce radiologists' inter-variability. We first show that standard deep learning methods fail to accurately predict HCC from CT-scans on a challenging database, and propose a two-step approach inspired by the LI-RADS system to improve the performance. We achieve improvements from 6 to 18 points of AUC with respect to deep learning baselines trained with different architectures. We also provide clinical validation of our method, achieving results that outperform non-expert radiologists and are on par with expert ones.
+
+ comment: IEEE ISBI 2025 +
+
+
+
+
+ + ☆ CellOMaps: A Compact Representation for Robust Classification of Lung + Adenocarcinoma Growth Patterns + + +
+ Lung adenocarcinoma (LUAD) is a morphologically heterogeneous disease, characterized by five primary histological growth patterns. The classification of such patterns is crucial due to their direct relation to prognosis, but the high subjectivity and observer variability pose a major challenge. Although several studies have developed machine learning methods for growth pattern classification, they either only report the predominant pattern per slide or lack proper evaluation. We propose a generalizable machine learning pipeline capable of classifying lung tissue into one of the five patterns or as non-tumor. The proposed pipeline's strength lies in a novel compact Cell Organization Maps (cellOMaps) representation that captures the cellular spatial patterns from Hematoxylin and Eosin whole slide images (WSIs). The proposed pipeline provides state-of-the-art performance on LUAD growth pattern classification when evaluated on both internal unseen slides and external datasets, significantly outperforming the current approaches. In addition, our preliminary results show that the model's outputs can be used to predict patients' Tumor Mutational Burden (TMB) levels.
+
+
+
+
+ + ☆ AgentPose: Progressive Distribution Alignment via Feature Agent for + Human Pose Distillation + + +
+ Pose distillation is widely adopted to reduce model size in human pose estimation. However, existing methods primarily emphasize the transfer of teacher knowledge while often neglecting the performance degradation resulting from the capacity gap between teacher and student. To address this issue, we propose AgentPose, a novel pose distillation method that integrates a feature agent to model the distribution of teacher features and progressively aligns the distribution of student features with that of the teacher features, effectively overcoming the capacity gap and enhancing the ability of knowledge transfer. Our comprehensive experiments conducted on the COCO dataset substantiate the effectiveness of our method in knowledge transfer, particularly in scenarios with a high capacity gap.
+
+ comment: 5 pages, 1 figure
+
+
+
+
+ + ☆ Benchmarking Vision Foundation Models for Input Monitoring in Autonomous + Driving + + +
+ Deep neural networks (DNNs) remain challenged by distribution shifts in +complex open-world domains like automated driving (AD): Absolute robustness +against yet unknown novel objects (semantic shift) or styles like lighting +conditions (covariate shift) cannot be guaranteed. Hence, reliable +operation-time monitors for identification of out-of-training-data-distribution +(OOD) scenarios are imperative. Current approaches for OOD classification are +untested for complex domains like AD, are limited in the kinds of shifts they +detect, or even require supervision with OOD samples. To prepare for +unanticipated shifts, we instead establish a framework around a principled, +unsupervised, and model-agnostic method that unifies detection of all kinds of +shifts: Find a full model of the training data's feature distribution, to then +use its density at new points as in-distribution (ID) score. To implement this, +we propose to combine the newly available Vision Foundation Models (VFM) as +feature extractors with one of four alternative density modeling techniques. In +an extensive benchmark of 4 VFMs against 20 baselines, we show the superior +performance of VFM feature encodings compared to shift-specific OOD monitors. +Additionally, we find that sophisticated architectures outperform larger latent +space dimensionality; and our method identifies samples with higher risk of +errors on downstream tasks, despite being model-agnostic. This suggests that +VFMs are promising to realize model-agnostic, unsupervised, reliable safety +monitors in complex vision tasks. + +
+
+
+
+
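+ The density-based monitor described above can be sketched in a few lines: fit a density model (here a Gaussian mixture, one of several possible choices) on frozen foundation-model features of the training data and use the log-density of new features as the in-distribution score. The feature dimensionality, the random stand-in features, and the 5% threshold below are placeholders.
```python
import numpy as np
from sklearn.mixture import GaussianMixture

# Stand-ins for frozen VFM embeddings; in practice these would come from a
# vision foundation model backbone applied to training and test images.
rng = np.random.default_rng(0)
train_feats = rng.normal(size=(2000, 64))
test_feats = rng.normal(loc=0.8, size=(10, 64))   # slightly shifted "new" samples

density = GaussianMixture(n_components=8, covariance_type="diag", random_state=0)
density.fit(train_feats)

id_scores = density.score_samples(test_feats)                  # higher = more in-distribution
threshold = np.quantile(density.score_samples(train_feats), 0.05)
print(id_scores < threshold)                                   # True marks likely OOD samples
```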
+ + ☆ Skeleton and Font Generation Network for Zero-shot Chinese Character + Generation + + +
+ Automatic font generation remains a challenging research issue, primarily due to the vast number of Chinese characters, each with unique and intricate structures. Our investigation of previous studies reveals inherent bias capable of causing structural changes in characters. Specifically, when generating a Chinese character similar to, but different from, those in the training samples, the bias is prone to either correcting or ignoring these subtle variations. To address this concern, we propose a novel Skeleton and Font Generation Network (SFGN) to achieve a more robust Chinese character font generation. Our approach includes a skeleton builder and font generator. The skeleton builder synthesizes content features using low-resource text input, enabling our technique to realize font generation independently of content image inputs. Unlike previous font generation methods that treat font style as a global embedding, we introduce a font generator to align content and style features on the radical level, which is a brand-new perspective for font generation. In addition to common characters, we also conduct experiments on misspelled characters, a substantial portion of which slightly differs from the common ones. Our approach visually demonstrates the efficacy of generated images and outperforms current state-of-the-art font generation methods. Moreover, we believe that misspelled character generation has significant pedagogical implications and verify this supposition through experiments. We used generated misspelled characters as data augmentation in Chinese character error correction tasks, simulating the scenario where students learn handwritten Chinese characters with the help of misspelled characters. The significantly improved performance of error correction tasks demonstrates the effectiveness of our proposed approach and the value of misspelled character generation.
+
+ comment: 36 pages, 10 figures +
+
+
+
+
+ + ☆ Self-Attentive Spatio-Temporal Calibration for Precise Intermediate + Layer Matching in ANN-to-SNN Distillation + + +
+ Spiking Neural Networks (SNNs) are promising for low-power computation due to their event-driven mechanism but often suffer from lower accuracy compared to Artificial Neural Networks (ANNs). ANN-to-SNN knowledge distillation can improve SNN performance, but previous methods either focus solely on label information, missing valuable intermediate layer features, or use a layer-wise approach that neglects spatial and temporal semantic inconsistencies, leading to performance degradation. To address these limitations, we propose a novel method called self-attentive spatio-temporal calibration (SASTC). SASTC uses self-attention to identify semantically aligned layer pairs between ANN and SNN, both spatially and temporally. This enables the autonomous transfer of relevant semantic information. Extensive experiments show that SASTC outperforms existing methods, effectively solving the mismatching problem. Superior accuracy results include 95.12% on CIFAR-10, 79.40% on CIFAR-100 with 2 time steps, and 68.69% on ImageNet with 4 time steps for static datasets, and 97.92% on DVS-Gesture and 83.60% on DVS-CIFAR10 for neuromorphic datasets. This marks the first time SNNs have outperformed ANNs on both CIFAR-10 and CIFAR-100, shedding new light on the potential applications of SNNs.
+
+
+
+
+ + ☆ Exploring visual language models as a powerful tool in the diagnosis of + Ewing Sarcoma + + +
+ Ewing's sarcoma (ES), characterized by a high density of small round blue cells without structural organization, presents a significant health concern, particularly among adolescents aged 10 to 19. Artificial intelligence-based systems for automated analysis of histopathological images are promising to contribute to an accurate diagnosis of ES. In this context, this study explores the feature extraction ability of different pre-training strategies for distinguishing ES from other soft tissue or bone sarcomas with similar morphology in digitized tissue microarrays for the first time, as far as we know. Vision-language supervision (VLS) is compared to fully-supervised ImageNet pre-training within a multiple instance learning paradigm. Our findings indicate a substantial improvement in diagnostic accuracy with the adaptation of VLS using an in-domain dataset. Notably, these models not only enhance the accuracy of predicted classes but also drastically reduce the number of trainable parameters and computational costs.
+
+ comment: 11 pages, 5 figures, 2 tables. Oral presentation at KES-InMed 2024 + held in Madeira, Portugal +
+
+
+
+
+ + ☆ Robust Low-Light Human Pose Estimation through Illumination-Texture + Modulation + + +
+ As critical visual details become obscured, the low visibility and high ISO noise in extremely low-light images pose a significant challenge to human pose estimation. Current methods fail to provide high-quality representations due to reliance on pixel-level enhancements that compromise semantics and the inability to effectively handle extreme low-light conditions for robust feature learning. In this work, we propose a frequency-based framework for low-light human pose estimation, rooted in the "divide-and-conquer" principle. Instead of uniformly enhancing the entire image, our method focuses on task-relevant information. By applying dynamic illumination correction to the low-frequency components and low-rank denoising to the high-frequency components, we effectively enhance both the semantic and texture information essential for accurate pose estimation. As a result, this targeted enhancement yields robust, high-quality representations, significantly improving pose estimation performance. Extensive experiments demonstrate its superiority over state-of-the-art methods in various challenging low-light scenarios.
+
+ comment: 5 pages, 2 figures, conference +
+
+
+
+
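+ The "divide-and-conquer" frequency split described above can be illustrated with a simple FFT-based decomposition. The hard circular cutoff and the toy brighten/damp steps below are placeholders standing in for the paper's learned illumination correction and low-rank denoising modules.
```python
import numpy as np

def split_frequencies(img: np.ndarray, cutoff: int = 8):
    """Split a grayscale image into low- and high-frequency components
    with a hard circular mask in the Fourier domain."""
    f = np.fft.fftshift(np.fft.fft2(img))
    h, w = img.shape
    yy, xx = np.ogrid[:h, :w]
    mask = (yy - h // 2) ** 2 + (xx - w // 2) ** 2 <= cutoff ** 2
    low = np.fft.ifft2(np.fft.ifftshift(f * mask)).real
    high = np.fft.ifft2(np.fft.ifftshift(f * (~mask))).real
    return low, high

img = np.random.rand(128, 128).astype(np.float32) * 0.05   # toy "low-light" image
low, high = split_frequencies(img)
# Placeholder enhancement: brighten the low-frequency part, damp high-frequency noise.
enhanced = np.clip(low * 4.0 + high * 0.5, 0.0, 1.0)
print(enhanced.shape)
```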
+ + ☆ DisCoPatch: Batch Statistics Are All You Need For OOD Detection, But + Only If You Can Trust Them + + +
+ Out-of-distribution (OOD) detection holds significant importance across many +applications. While semantic and domain-shift OOD problems are well-studied, +this work focuses on covariate shifts - subtle variations in the data +distribution that can degrade machine learning performance. We hypothesize that +detecting these subtle shifts can improve our understanding of in-distribution +boundaries, ultimately improving OOD detection. In adversarial discriminators +trained with Batch Normalization (BN), real and adversarial samples form +distinct domains with unique batch statistics - a property we exploit for OOD +detection. We introduce DisCoPatch, an unsupervised Adversarial Variational +Autoencoder (VAE) framework that harnesses this mechanism. During inference, +batches consist of patches from the same image, ensuring a consistent data +distribution that allows the model to rely on batch statistics. DisCoPatch uses +the VAE's suboptimal outputs (generated and reconstructed) as negative samples +to train the discriminator, thereby improving its ability to delineate the +boundary between in-distribution samples and covariate shifts. By tightening +this boundary, DisCoPatch achieves state-of-the-art results in public OOD +detection benchmarks. The proposed model not only excels in detecting covariate +shifts, achieving 95.5% AUROC on ImageNet-1K(-C) but also outperforms all prior +methods on public Near-OOD (95.0%) benchmarks. With a compact model size of +25MB, it achieves high OOD detection performance at notably lower latency than +existing methods, making it an efficient and practical solution for real-world +OOD detection applications. The code will be made publicly available + +
+
+
+
+
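+ A small sketch of the inference-time batching idea described above: cut a single image into non-overlapping patches and feed them as one batch, so that batch statistics are computed over patches of the same image. The patch size and tensor shapes are illustrative assumptions, not DisCoPatch's exact configuration.
```python
import torch

def image_to_patch_batch(img: torch.Tensor, patch: int = 64) -> torch.Tensor:
    """Turn one (C, H, W) image into an (N, C, patch, patch) batch of
    non-overlapping patches (H and W are assumed divisible by `patch`)."""
    c, h, w = img.shape
    patches = img.unfold(1, patch, patch).unfold(2, patch, patch)   # (C, H/p, W/p, p, p)
    patches = patches.permute(1, 2, 0, 3, 4).reshape(-1, c, patch, patch)
    return patches

img = torch.rand(3, 256, 256)
batch = image_to_patch_batch(img)
print(batch.shape)  # torch.Size([16, 3, 64, 64]); BatchNorm stats now come from one image
```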
+ + ☆ Maximizing Uncertainty for Federated learning via Bayesian + Optimisation-based Model Poisoning + + +
+ As we transition from Narrow Artificial Intelligence towards Artificial Super Intelligence, users are increasingly concerned about their privacy and the trustworthiness of machine learning (ML) technology. A common denominator for the metrics of trustworthiness is the quantification of uncertainty inherent in DL algorithms, and specifically in the model parameters, input data, and model predictions. One of the common approaches to address privacy-related issues in DL is to adopt distributed learning such as federated learning (FL), where private raw data is not shared among users. Despite the privacy-preserving mechanisms in FL, it still faces challenges in trustworthiness. Specifically, malicious users can, during training, systematically create malicious model parameters to compromise the model's predictive and generative capabilities, resulting in high uncertainty about their reliability. To demonstrate malicious behaviour, we propose a novel model poisoning attack method named Delphi which aims to maximise the uncertainty of the global model output. We achieve this by taking advantage of the relationship between the uncertainty and the model parameters of the first hidden layer of the local model. Delphi employs two types of optimisation, Bayesian Optimisation and Least Squares Trust Region, to search for the optimal poisoned model parameters, named Delphi-BO and Delphi-LSTR. We quantify the uncertainty using the KL Divergence to minimise the distance of the predictive probability distribution towards an uncertain distribution of model output. Furthermore, we establish a mathematical proof for the attack effectiveness demonstrated in FL. Numerical results demonstrate that Delphi-BO induces a higher amount of uncertainty than Delphi-LSTR, highlighting the vulnerability of FL systems to model poisoning attacks.
+
+ comment: 14 pages +
+
+
+
+
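+ The attack objective sketched above, pushing the global model's predictive distribution toward maximal uncertainty, can be written as a KL divergence to the uniform distribution. The snippet below only shows this objective on toy logits; it does not implement the Bayesian Optimisation or trust-region search from the abstract, and all shapes are placeholders.
```python
import math
import torch
import torch.nn.functional as F

def uncertainty_loss(logits: torch.Tensor) -> torch.Tensor:
    """KL(predictive distribution || uniform). Driving this toward zero makes
    the model's output maximally uncertain, i.e. close to uniform."""
    num_classes = logits.shape[-1]
    log_p = F.log_softmax(logits, dim=-1)
    log_u = torch.full_like(log_p, -math.log(num_classes))
    # F.kl_div(input, target) computes sum target * (log target - input),
    # so this evaluates KL(p || uniform) with p passed as the target.
    return F.kl_div(log_u, log_p.exp(), reduction="batchmean")

logits = torch.randn(32, 10, requires_grad=True)
loss = uncertainty_loss(logits)
loss.backward()          # such gradients could steer poisoned parameters
print(float(loss))
```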
+ + ☆ Combining imaging and shape features for prediction tasks of Alzheimer's + disease classification and brain age regression + + +
+ We investigate combining imaging and shape features extracted from MRI for +the clinically relevant tasks of brain age prediction and Alzheimer's disease +classification. Our proposed model fuses ResNet-extracted image embeddings with +shape embeddings from a bespoke graph neural network. The shape embeddings are +derived from surface meshes of 15 brain structures, capturing detailed +geometric information. Combined with the appearance features from T1-weighted +images, we observe improvements in the prediction performance on both tasks, +with substantial gains for classification. We evaluate the model using public +datasets, including CamCAN, IXI, and OASIS3, demonstrating the effectiveness of +fusing imaging and shape features for brain analysis. + +
+
+
+
+
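+ A minimal sketch of the late fusion described above: concatenate an image embedding (e.g., from a ResNet) with a shape embedding (e.g., from a graph network over surface meshes) and feed the result to a small prediction head. All dimensions and the head architecture are assumptions for illustration, not the paper's model.
```python
import torch
import torch.nn as nn

class FusionHead(nn.Module):
    """Concatenate image and shape embeddings and predict age (regression)
    or a diagnosis logit (classification) from the fused vector."""
    def __init__(self, img_dim: int = 512, shape_dim: int = 128, out_dim: int = 1):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(img_dim + shape_dim, 256), nn.ReLU(),
            nn.Dropout(0.2), nn.Linear(256, out_dim),
        )

    def forward(self, img_emb: torch.Tensor, shape_emb: torch.Tensor) -> torch.Tensor:
        return self.mlp(torch.cat([img_emb, shape_emb], dim=-1))

head = FusionHead()
img_emb, shape_emb = torch.randn(4, 512), torch.randn(4, 128)
print(head(img_emb, shape_emb).shape)  # torch.Size([4, 1]), e.g. a predicted brain age
```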
+ + ☆ GAC-Net_Geometric and attention-based Network for Depth Completion + + +
+ Depth completion is a key task in autonomous driving, aiming to complete +sparse LiDAR depth measurements into high-quality dense depth maps through +image guidance. However, existing methods usually treat depth maps as an +additional channel of color images, or directly perform convolution on sparse +data, failing to fully exploit the 3D geometric information in depth maps, +especially with limited performance in complex boundaries and sparse areas. To +address these issues, this paper proposes a depth completion network combining +channel attention mechanism and 3D global feature perception (CGA-Net). The +main innovations include: 1) Utilizing PointNet++ to extract global 3D +geometric features from sparse depth maps, enhancing the scene perception +ability of low-line LiDAR data; 2) Designing a channel-attention-based +multimodal feature fusion module to efficiently integrate sparse depth, RGB +images, and 3D geometric features; 3) Combining residual learning with CSPN++ +to optimize the depth refinement stage, further improving the completion +quality in edge areas and complex scenes. Experiments on the KITTI depth +completion dataset show that CGA-Net can significantly improve the prediction +accuracy of dense depth maps, achieving a new state-of-the-art (SOTA), and +demonstrating strong robustness to sparse and complex scenes. + +
+
+ comment: 13 pages, 4 figures, 2 tables
+
+
+
+
+ + ☆ Threshold Attention Network for Semantic Segmentation of Remote Sensing + Images + + +
+ Semantic segmentation of remote sensing images is essential for various applications, including vegetation monitoring, disaster management, and urban planning. Previous studies have demonstrated that the self-attention mechanism (SA) is an effective approach for designing segmentation networks that can capture long-range pixel dependencies. SA enables the network to model the global dependencies between the input features, resulting in improved segmentation outcomes. However, the high density of attentional feature maps used in this mechanism causes exponential increases in computational complexity. Additionally, it introduces redundant information that negatively impacts the feature representation. Inspired by traditional threshold segmentation algorithms, we propose a novel threshold attention mechanism (TAM). This mechanism significantly reduces computational effort while also better modeling the correlation between different regions of the feature map. Based on TAM, we present a threshold attention network (TANet) for semantic segmentation. TANet consists of an attentional feature enhancement module (AFEM) for global feature enhancement of shallow features and a threshold attention pyramid pooling module (TAPP) for acquiring feature information at different scales for deep features. We have conducted extensive experiments on the ISPRS Vaihingen and Potsdam datasets. The results demonstrate the validity and superiority of our proposed TANet compared to state-of-the-art models.
+
+
+
+
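+ To make the general idea of thresholded attention concrete (this is a generic sketch of attention sparsification, not TANet's exact TAM): attention weights below a threshold are zeroed and the remaining weights renormalised, reducing the number of effective pixel interactions. The threshold value and shapes are placeholder assumptions.
```python
import torch
import torch.nn.functional as F

def thresholded_attention(q, k, v, tau: float = 0.02):
    """Scaled dot-product attention in which weights below `tau` are dropped
    and the remaining ones renormalised. Shapes: (batch, tokens, dim)."""
    scores = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)
    attn = F.softmax(scores, dim=-1)
    attn = torch.where(attn >= tau, attn, torch.zeros_like(attn))
    attn = attn / attn.sum(dim=-1, keepdim=True).clamp_min(1e-6)   # renormalise survivors
    return attn @ v

q = k = v = torch.randn(2, 64, 32)
print(thresholded_attention(q, k, v).shape)   # torch.Size([2, 64, 32])
```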
+ + ☆ V-Trans4Style: Visual Transition Recommendation for Video Production + Style Adaptation + + +
+ We introduce V-Trans4Style, an innovative algorithm tailored for dynamic +video content editing needs. It is designed to adapt videos to different +production styles like documentaries, dramas, feature films, or a specific +YouTube channel's video-making technique. Our algorithm recommends optimal +visual transitions to help achieve this flexibility using a more bottom-up +approach. We first employ a transformer-based encoder-decoder network to learn +recommending temporally consistent and visually seamless sequences of visual +transitions using only the input videos. We then introduce a style conditioning +module that leverages this model to iteratively adjust the visual transitions +obtained from the decoder through activation maximization. We demonstrate the +efficacy of our method through experiments conducted on our newly introduced +AutoTransition++ dataset. It is a 6k video version of AutoTransition Dataset +that additionally categorizes its videos into different production style +categories. Our encoder-decoder model outperforms the state-of-the-art +transition recommendation method, achieving improvements of 10% to 80% in +Recall@K and mean rank values over baseline. Our style conditioning module +results in visual transitions that improve the capture of the desired video +production style characteristics by an average of around 12% in comparison to +other methods when measured with similarity metrics. We hope that our work +serves as a foundation for exploring and understanding video production styles +further. + +
+
+
+
+
+ + ☆ Facial Dynamics in Video: Instruction Tuning for Improved Facial + Expression Perception and Contextual Awareness + + +
+ Facial expression captioning has found widespread application across various domains. Recently, video Multimodal Large Language Models (MLLMs) have shown promise in general video understanding tasks. However, describing facial expressions within videos poses two major challenges for these models: (1) the lack of adequate datasets and benchmarks, and (2) the limited visual token capacity of video MLLMs. To address these issues, this paper introduces a new instruction-following dataset tailored for dynamic facial expression captioning. The dataset comprises 5,033 high-quality video clips annotated manually, containing over 700,000 tokens. Its purpose is to improve the capability of video MLLMs to discern subtle facial nuances. Furthermore, we propose FaceTrack-MM, which leverages a limited number of tokens to encode the main character's face. This model demonstrates superior performance in tracking faces and focusing on the facial expressions of the main characters, even in intricate multi-person scenarios. Additionally, we introduce a novel evaluation metric combining event extraction, relation classification, and the longest common subsequence (LCS) algorithm to assess the content consistency and temporal sequence consistency of generated text. Moreover, we present FEC-Bench, a benchmark designed to assess the performance of existing video MLLMs in this specific task. All data and source code will be made publicly available.
+
+
+
+
+ + ☆ Zero-shot Video Moment Retrieval via Off-the-shelf Multimodal Large + Language Models + + +
+ The target of video moment retrieval (VMR) is predicting temporal spans within a video that semantically match a given linguistic query. Existing VMR methods based on multimodal large language models (MLLMs) overly rely on expensive high-quality datasets and time-consuming fine-tuning. Although some recent studies introduce a zero-shot setting to avoid fine-tuning, they overlook inherent language bias in the query, leading to erroneous localization. To tackle the aforementioned challenges, this paper proposes Moment-GPT, a tuning-free pipeline for zero-shot VMR utilizing frozen MLLMs. Specifically, we first employ LLaMA-3 to correct and rephrase the query to mitigate language bias. Subsequently, we design a span generator combined with MiniGPT-v2 to produce candidate spans adaptively. Finally, to leverage the video comprehension capabilities of MLLMs, we apply VideoChatGPT and a span scorer to select the most appropriate spans. Our proposed method substantially outperforms the state-of-the-art MLLM-based and zero-shot models on several public datasets, including QVHighlights, ActivityNet-Captions, and Charades-STA.
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ SkipClick: Combining Quick Responses and Low-Level Features for + Interactive Segmentation in Winter Sports Contexts + + +
+ In this paper, we present a novel architecture for interactive segmentation in winter sports contexts. The field of interactive segmentation deals with the prediction of high-quality segmentation masks by informing the network about the object's position with the help of user guidance. In our case the guidance consists of click prompts. For this task, we first present a baseline architecture which is specifically geared towards quickly responding after each click. Afterwards, we motivate and describe a number of architectural modifications which improve the performance when tasked with segmenting winter sports equipment on the WSESeg dataset. With regards to the average NoC@85 metric on the WSESeg classes, we outperform SAM and HQ-SAM by 2.336 and 7.946 clicks, respectively. When applied to the HQSeg-44k dataset, our system delivers state-of-the-art results with a NoC@90 of 6.00 and NoC@95 of 9.89. In addition to that, we test our model on a novel dataset containing masks for humans during skiing.
+
+ comment: 4 figures, 6 tables, 12 pages +
+
+
+
+
+ + ☆ AI Guide Dog: Egocentric Path Prediction on Smartphone + + +
+ This paper introduces AI Guide Dog (AIGD), a lightweight egocentric +navigation assistance system for visually impaired individuals, designed for +real-time deployment on smartphones. AIGD addresses key challenges in blind +navigation by employing a vision-only, multi-label classification approach to +predict directional commands, ensuring safe traversal across diverse +environments. We propose a novel technique to enable goal-based outdoor +navigation by integrating GPS signals and high-level directions, while also +addressing uncertain multi-path predictions for destination-free indoor +navigation. Our generalized model is the first navigation assistance system to +handle both goal-oriented and exploratory navigation scenarios across indoor +and outdoor settings, establishing a new state-of-the-art in blind navigation. +We present methods, datasets, evaluations, and deployment insights to encourage +further innovations in assistive navigation systems. + +
+
+
+
+
+ + ☆ Robust Hyperspectral Image Panshapring via Sparse Spatial-Spectral + Representation RSS 2025 + + +
+ High-resolution hyperspectral imaging plays a crucial role in various remote sensing applications, yet its acquisition often faces fundamental limitations due to hardware constraints. This paper introduces S$^{3}$RNet, a novel framework for hyperspectral image pansharpening that effectively combines low-resolution hyperspectral images (LRHSI) with high-resolution multispectral images (HRMSI) through sparse spatial-spectral representation. The core of S$^{3}$RNet is the Multi-Branch Fusion Network (MBFN), which employs parallel branches to capture complementary features at different spatial and spectral scales. Unlike traditional approaches that treat all features equally, our Spatial-Spectral Attention Weight Block (SSAWB) dynamically adjusts feature weights to maintain sparse representation while suppressing noise and redundancy. To enhance feature propagation, we incorporate the Dense Feature Aggregation Block (DFAB), which efficiently aggregates input features through dense connectivity patterns. This integrated design enables S$^{3}$RNet to selectively emphasize the most informative features from different scales while maintaining computational efficiency. Comprehensive experiments demonstrate that S$^{3}$RNet achieves state-of-the-art performance across multiple evaluation metrics, showing particular strength in maintaining high reconstruction quality even under challenging noise conditions. The code will be made publicly available.
+
+ comment: Submitted to IGARSS 2025 +
+
+
+
+
+ + ☆ Early prediction of the transferability of bovine embryos from + videomicroscopy + + +
+ Videomicroscopy is a promising tool combined with machine learning for +studying the early development of in vitro fertilized bovine embryos and +assessing its transferability as soon as possible. We aim to predict the embryo +transferability within four days at most, taking 2D time-lapse microscopy +videos as input. We formulate this problem as a supervised binary +classification problem for the classes transferable and not transferable. The +challenges are three-fold: 1) poorly discriminating appearance and motion, 2) +class ambiguity, 3) small amount of annotated data. We propose a 3D +convolutional neural network involving three pathways, which makes it +multi-scale in time and able to handle appearance and motion in different ways. +For training, we retain the focal loss. Our model, named SFR, compares +favorably to other methods. Experiments demonstrate its effectiveness and +accuracy for our challenging biological task. + +
+
+ comment: Accepted at the 2024 IEEE International Conference on Image + Processing +
+
+
+
+
+ + ☆ VENOM: Text-driven Unrestricted Adversarial Example Generation with + Diffusion Models + + +
+ Adversarial attacks have proven effective in deceiving machine learning +models by subtly altering input images, motivating extensive research in recent +years. Traditional methods constrain perturbations within $l_p$-norm bounds, +but advancements in Unrestricted Adversarial Examples (UAEs) allow for more +complex, generative-model-based manipulations. Diffusion models now lead UAE +generation due to superior stability and image quality over GANs. However, +existing diffusion-based UAE methods are limited to using reference images and +face challenges in generating Natural Adversarial Examples (NAEs) directly from +random noise, often producing uncontrolled or distorted outputs. In this work, +we introduce VENOM, the first text-driven framework for high-quality +unrestricted adversarial examples generation through diffusion models. VENOM +unifies image content generation and adversarial synthesis into a single +reverse diffusion process, enabling high-fidelity adversarial examples without +sacrificing attack success rate (ASR). To stabilize this process, we +incorporate an adaptive adversarial guidance strategy with momentum, ensuring +that the generated adversarial examples $x^*$ align with the distribution +$p(x)$ of natural images. Extensive experiments demonstrate that VENOM achieves +superior ASR and image quality compared to prior methods, marking a significant +advancement in adversarial example generation and providing insights into model +vulnerabilities for improved defense development. + +
+
+
+
+
+ + ☆ Cloud Removal With PolSAR-Optical Data Fusion Using A Two-Flow Residual + Network + + +
+ Optical remote sensing images play a crucial role in the observation of the +Earth's surface. However, obtaining complete optical remote sensing images is +challenging due to cloud cover. Reconstructing cloud-free optical images has +become a major task in recent years. This paper presents a two-flow +Polarimetric Synthetic Aperture Radar (PolSAR)-Optical data fusion cloud +removal algorithm (PODF-CR), which achieves the reconstruction of missing +optical images. PODF-CR consists of an encoding module and a decoding module. +The encoding module includes two parallel branches that extract PolSAR image +features and optical image features. To address speckle noise in PolSAR images, +we introduce dynamic filters in the PolSAR branch for image denoising. To +better facilitate the fusion between multimodal optical images and PolSAR +images, we propose fusion blocks based on cross-skip connections to enable +interaction of multimodal data information. The obtained fusion features are +refined through an attention mechanism to provide better conditions for the +subsequent decoding of the fused images. In the decoding module, multi-scale +convolution is introduced to obtain multi-scale information. Additionally, to +better utilize comprehensive scattering information and polarization +characteristics to assist in the restoration of optical images, we use a +dataset for cloud restoration called OPT-BCFSAR-PFSAR, which includes +backscatter coefficient feature images and polarization feature images obtained +from PoLSAR data and optical images. Experimental results demonstrate that this +method outperforms existing methods in both qualitative and quantitative +evaluations. + +
+
+
+
+
+ + ☆ Demographic Variability in Face Image Quality Measures + + +
+ Face image quality assessment (FIQA) algorithms are being integrated into +online identity management applications. These applications allow users to +upload a face image as part of their document issuance process, where the image +is then run through a quality assessment process to make sure it meets the +quality and compliance requirements. Concerns about demographic bias have been +raised about biometric systems, given the societal implications this may cause. +It is therefore important that demographic variability in FIQA algorithms is +assessed such that mitigation measures can be created. In this work, we study +the demographic variability of all face image quality measures included in the +ISO/IEC 29794-5 international standard across three demographic variables: age, +gender, and skin tone. The results are rather promising and show no clear bias +toward any specific demographic group for most measures. Only two quality +measures are found to have considerable variations in their outcomes for +different groups on the skin tone variable. + +
+
+
+
+
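+ The kind of per-group analysis described above can be reproduced with a simple groupby over quality scores. The column names and the made-up scores below are placeholders standing in for real FIQA measure outputs and demographic labels.
```python
import pandas as pd

# Hypothetical per-image records: one FIQA measure and demographic attributes.
df = pd.DataFrame({
    "sharpness_score": [78, 82, 75, 90, 88, 73, 81, 86],
    "age_group":       ["18-30", "31-50", "51+", "18-30", "31-50", "51+", "18-30", "51+"],
    "skin_tone":       ["light", "dark", "light", "dark", "light", "dark", "light", "dark"],
})

# Compare the score distribution across demographic groups.
summary = df.groupby("skin_tone")["sharpness_score"].agg(["mean", "std", "count"])
print(summary)   # large gaps between group means would warrant a closer bias audit
```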
+ + ☆ Tarsier2: Advancing Large Vision-Language Models from Detailed Video + Description to Comprehensive Video Understanding + + +
+ We introduce Tarsier2, a state-of-the-art large vision-language model (LVLM) +designed for generating detailed and accurate video descriptions, while also +exhibiting superior general video understanding capabilities. Tarsier2 achieves +significant advancements through three key upgrades: (1) Scaling pre-training +data from 11M to 40M video-text pairs, enriching both volume and diversity; (2) +Performing fine-grained temporal alignment during supervised fine-tuning; (3) +Using model-based sampling to automatically construct preference data and +applying DPO training for optimization. Extensive experiments show that +Tarsier2-7B consistently outperforms leading proprietary models, including +GPT-4o and Gemini 1.5 Pro, in detailed video description tasks. On the DREAM-1K +benchmark, Tarsier2-7B improves F1 by 2.8\% over GPT-4o and 5.8\% over +Gemini-1.5-Pro. In human side-by-side evaluations, Tarsier2-7B shows a +8.6\% +performance advantage over GPT-4o and +24.9\% over Gemini-1.5-Pro. Tarsier2-7B +also sets new state-of-the-art results across 15 public benchmarks, spanning +tasks such as video question-answering, video grounding, hallucination test, +and embodied question-answering, demonstrating its versatility as a robust +generalist vision-language model. + +
+
+
+
+
+ + ☆ Mitigating Algorithmic Bias in Multiclass CNN Classifications Using + Causal Modeling + + +
+ This study describes a procedure for applying causal modeling to detect and +mitigate algorithmic bias in a multiclass classification problem. The dataset +was derived from the FairFace dataset, supplemented with emotional labels +generated by the DeepFace pre-trained model. A custom Convolutional Neural +Network (CNN) was developed, consisting of four convolutional blocks, followed +by fully connected layers and dropout layers to mitigate overfitting. Gender +bias was identified in the CNN model's classifications: Females were more +likely to be classified as "happy" or "sad," while males were more likely to be +classified as "neutral." To address this, the one-vs-all (OvA) technique was +applied. A causal model was constructed for each emotion class to adjust the +CNN model's predicted class probabilities. The adjusted probabilities for the +various classes were then aggregated by selecting the class with the highest +probability. The resulting debiased classifications demonstrated enhanced +gender fairness across all classes, with negligible impact--or even a slight +improvement--on overall accuracy. This study highlights that algorithmic +fairness and accuracy are not necessarily trade-offs. All data and code for +this study are publicly available for download. + +
+
+ comment: 7 pages; 6 figures +
+
+
+
+
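+ The one-vs-all aggregation step described above is easy to sketch: per-class probabilities, after some per-class adjustment, are renormalised and the final label is the class with the highest adjusted probability. The multiplicative adjustment below is a placeholder, not the study's fitted causal model, and the numbers are toy values.
```python
import numpy as np

def debiased_prediction(class_probs: dict[str, float],
                        adjustments: dict[str, float]) -> str:
    """Apply a per-class adjustment (stand-in for the causal model's correction)
    to one-vs-all probabilities and return the class with the highest score."""
    adjusted = {c: p * adjustments.get(c, 1.0) for c, p in class_probs.items()}
    total = sum(adjusted.values())
    adjusted = {c: p / total for c, p in adjusted.items()}     # renormalise
    return max(adjusted, key=adjusted.get)

probs = {"happy": 0.40, "sad": 0.25, "neutral": 0.35}
# Hypothetical corrections that dampen classes over-predicted for one gender.
adjust = {"happy": 0.8, "sad": 0.9, "neutral": 1.2}
print(debiased_prediction(probs, adjust))   # -> "neutral" after adjustment
```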
+ + ☆ Make-A-Character 2: Animatable 3D Character Generation From a Single + Image + + +
+ This report introduces Make-A-Character 2, an advanced system for generating +high-quality 3D characters from single portrait photographs, ideal for game +development and digital human applications. Make-A-Character 2 builds upon its +predecessor by incorporating several significant improvements for image-based +head generation. We utilize the IC-Light method to correct non-ideal +illumination in input photos and apply neural network-based color correction to +harmonize skin tones between the photos and game engine renders. We also employ +the Hierarchical Representation Network to capture high-frequency facial +structures and conduct adaptive skeleton calibration for accurate and +expressive facial animations. The entire image-to-3D-character generation +process takes less than 2 minutes. Furthermore, we leverage transformer +architecture to generate co-speech facial and gesture actions, enabling +real-time conversation with the generated character. These technologies have +been integrated into our conversational AI avatar products. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ deepTerra -- AI Land Classification Made Easy + + +
+ deepTerra is a comprehensive platform designed to facilitate the +classification of land surface features using machine learning and satellite +imagery. The platform includes modules for data collection, image augmentation, +training, testing, and prediction, streamlining the entire workflow for image +classification tasks. This paper presents a detailed overview of the +capabilities of deepTerra, shows how it has been applied to various research +areas, and discusses the future directions it might take. + +
+
+
+
+
+ + ☆ State-of-the-Art Transformer Models for Image Super-Resolution: + Techniques, Challenges, and Applications + + +
+ Image Super-Resolution (SR) aims to recover a high-resolution image from its low-resolution counterpart, which has been affected by a specific degradation process. This is achieved by enhancing detail and visual quality. Recent advancements in transformer-based methods have reshaped image super-resolution by enabling high-quality reconstructions that surpass previous deep-learning approaches such as CNN- and GAN-based models. This effectively addresses the limitations of previous methods, such as limited receptive fields, poor global context capture, and challenges in high-frequency detail recovery. Additionally, the paper reviews recent trends and advancements in transformer-based SR models, exploring various innovative techniques and architectures that combine transformers with traditional networks to balance global and local contexts. These neoteric methods are critically analyzed, revealing promising yet unexplored gaps and potential directions for future research. Several visualizations of models and techniques are included to foster a holistic understanding of recent trends. This work seeks to offer a structured roadmap for researchers at the forefront of deep learning, specifically exploring the impact of transformers on super-resolution techniques.
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ An Intra- and Cross-frame Topological Consistency Scheme for + Semi-supervised Atherosclerotic Coronary Plaque Segmentation + + +
+ Enhancing the precision of segmenting coronary atherosclerotic plaques from +CT Angiography (CTA) images is pivotal for advanced Coronary Atherosclerosis +Analysis (CAA), which distinctively relies on the analysis of vessel +cross-section images reconstructed via Curved Planar Reformation. This task +presents significant challenges due to the indistinct boundaries and structures +of plaques and blood vessels, leading to the inadequate performance of current +deep learning models, compounded by the inherent difficulty in annotating such +complex data. To address these issues, we propose a novel dual-consistency +semi-supervised framework that integrates Intra-frame Topological Consistency +(ITC) and Cross-frame Topological Consistency (CTC) to leverage labeled and +unlabeled data. ITC employs a dual-task network for simultaneous segmentation +mask and Skeleton-aware Distance Transform (SDT) prediction, achieving similar +prediction of topology structure through consistency constraint without +additional annotations. Meanwhile, CTC utilizes an unsupervised estimator for +analyzing pixel flow between skeletons and boundaries of adjacent frames, +ensuring spatial continuity. Experiments on two CTA datasets show that our +method surpasses existing semi-supervised methods and approaches the +performance of supervised methods on CAA. In addition, our method also performs +better than other methods on the ACDC dataset, demonstrating its +generalization. + +
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ☆ 3UR-LLM: An End-to-End Multimodal Large Language Model for 3D Scene + Understanding + + +
+ Multi-modal Large Language Models (MLLMs) exhibit impressive capabilities in 2D tasks, yet encounter challenges in discerning the spatial positions, interrelations, and causal logic in scenes when transitioning from 2D to 3D representations. We find that the limitations mainly lie in: i) the high annotation cost restricting the scale-up of volumes of 3D scene data, and ii) the lack of a straightforward and effective way to perceive 3D information, which results in prolonged training durations and complicates the streamlined framework. To this end, we develop a pipeline based on open-source 2D MLLMs and LLMs to generate high-quality 3D-text pairs and construct 3DS-160K to enhance the pre-training process. Leveraging this high-quality pre-training data, we introduce the 3UR-LLM model, an end-to-end 3D MLLM designed for precise interpretation of 3D scenes, showcasing exceptional capability in navigating the complexities of the physical world. 3UR-LLM directly receives 3D point clouds as input and projects 3D features fused with text instructions into a manageable set of tokens. Considering the computation burden derived from these hybrid tokens, we design a 3D compressor module to cohesively compress the 3D spatial cues and textual narrative. 3UR-LLM achieves promising performance with respect to the previous SOTAs, for instance, 3UR-LLM exceeds its counterparts by 7.1\% CIDEr on ScanQA, while utilizing fewer training resources. The code and model weights for 3UR-LLM and the 3DS-160K benchmark are available at 3UR-LLM.
+
+ comment: Accepted to IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ☆ AVS-Mamba: Exploring Temporal and Multi-modal Mamba for Audio-Visual + Segmentation + + +
+ The essence of audio-visual segmentation (AVS) lies in locating and +delineating sound-emitting objects within a video stream. While +Transformer-based methods have shown promise, their handling of long-range +dependencies struggles due to quadratic computational costs, presenting a +bottleneck in complex scenarios. To overcome this limitation and facilitate +complex multi-modal comprehension with linear complexity, we introduce +AVS-Mamba, a selective state space model to address the AVS task. Our framework +incorporates two key components for video understanding and cross-modal +learning: Temporal Mamba Block for sequential video processing and +Vision-to-Audio Fusion Block for advanced audio-vision integration. Building on +this, we develop the Multi-scale Temporal Encoder, aimed at enhancing the +learning of visual features across scales, facilitating the perception of +intra- and inter-frame information. To perform multi-modal fusion, we propose +the Modality Aggregation Decoder, leveraging the Vision-to-Audio Fusion Block +to integrate visual features into audio features across both frame and temporal +levels. Further, we adopt the Contextual Integration Pyramid to perform +audio-to-vision spatial-temporal context collaboration. Through these +innovative contributions, our approach achieves new state-of-the-art results on +the AVSBench-object and AVSBench-semantic datasets. Our source code and model +weights are available at AVS-Mamba. + +
+
+ comment: Accepted to IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ☆ A Low-cost and Ultra-lightweight Binary Neural Network for Traffic + Signal Recognition + + +
+ The deployment of neural networks in vehicle platforms and wearable +Artificial Intelligence-of-Things (AIOT) scenarios has become a research area +that has attracted much attention. With the continuous evolution of deep +learning technology, many image classification models are committed to +improving recognition accuracy, but this is often accompanied by problems such +as large model resource usage, complex structure, and high power consumption, +which makes it challenging to deploy on resource-constrained platforms. Herein, +we propose an ultra-lightweight binary neural network (BNN) model designed for +hardware deployment, and conduct image classification research based on the +German Traffic Sign Recognition Benchmark (GTSRB) dataset. In addition, we also +verify it on the Chinese Traffic Sign (CTS) and Belgian Traffic Sign (BTS) +datasets. The proposed model shows excellent recognition performance with an +accuracy of up to 97.64%, making it one of the best performing BNN models in +the GTSRB dataset. Compared with the full-precision model, the accuracy loss is +controlled within 1%, and the parameter storage overhead of the model is only +10% of that of the full-precision model. More importantly, our network model +only relies on logical operations and low-bit width fixed-point addition and +subtraction operations during the inference phase, which greatly simplifies the +design complexity of the processing element (PE). Our research shows the great +potential of BNN in the hardware deployment of computer vision models, +especially in the field of computer vision tasks related to autonomous driving. + +
+
+
+
+
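+ As a generic illustration of the binary-network building block implied above (sign-binarised weights with a straight-through estimator for training), not the paper's exact architecture; the layer sizes and the 43-class output are placeholder assumptions:
```python
import torch
import torch.nn as nn

class BinarizeSTE(torch.autograd.Function):
    """Sign binarisation with a straight-through gradient estimator."""
    @staticmethod
    def forward(ctx, w):
        ctx.save_for_backward(w)
        return torch.sign(w)

    @staticmethod
    def backward(ctx, grad_out):
        (w,) = ctx.saved_tensors
        return grad_out * (w.abs() <= 1).float()   # pass gradients only for small weights

class BinaryLinear(nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.1)

    def forward(self, x):
        return x @ BinarizeSTE.apply(self.weight).t()   # inference needs only +1/-1 weights

layer = BinaryLinear(3 * 32 * 32, 43)            # e.g. 43 traffic-sign classes as in GTSRB
logits = layer(torch.randn(8, 3 * 32 * 32))
logits.sum().backward()
print(layer.weight.grad.shape)
```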
+ + ☆ Learning Motion and Temporal Cues for Unsupervised Video Object + Segmentation + + +
+ In this paper, we address the challenges in unsupervised video object segmentation (UVOS) by proposing an efficient algorithm, termed MTNet, which concurrently exploits motion and temporal cues. Unlike previous methods that focus solely on integrating appearance with motion or on modeling temporal relations, our method combines both aspects by integrating them within a unified framework. MTNet is devised by effectively merging appearance and motion features during the feature extraction process within encoders, promoting a more complementary representation. To capture the intricate long-range contextual dynamics and information embedded within videos, a temporal transformer module is introduced, facilitating efficacious inter-frame interactions throughout a video clip. Furthermore, we employ a cascade of decoders across all feature levels to optimally exploit the derived features, aiming to generate increasingly precise segmentation masks. As a result, MTNet provides a strong and compact framework that explores both temporal and cross-modality knowledge to robustly and efficiently localize and track the primary object in various challenging scenarios. Extensive experiments across diverse benchmarks conclusively show that our method not only attains state-of-the-art performance in unsupervised video object segmentation but also delivers competitive results in video salient object detection. These findings highlight the method's robust versatility and its adeptness in adapting to a range of segmentation tasks. Source code is available at https://github.com/hy0523/MTNet.
+
+ comment: Accepted to IEEE Transactions on Neural Networks and Learning Systems + (TNNLS) +
+
+
+
+
+ + ☆ Balance Divergence for Knowledge Distillation + + +
+ Knowledge distillation has been widely adopted in computer vision task processing, since it can effectively enhance the performance of lightweight student networks by leveraging the knowledge transferred from cumbersome teacher networks. Most existing knowledge distillation methods utilize Kullback-Leibler divergence to mimic the logit output probabilities between the teacher network and the student network. Nonetheless, these methods may neglect the negative parts of the teacher's "dark knowledge" because the divergence calculations may ignore the effect of the minute probabilities from the teacher's logit output. This deficiency may lead to suboptimal performance in logit mimicry during the distillation process and result in an imbalance of information acquired by the student network. In this paper, we investigate the impact of this imbalance and propose a novel method, named Balance Divergence Distillation. By introducing a compensatory operation using reverse Kullback-Leibler divergence, our method can improve the modeling of the extremely small values in the negative part of the teacher's output and preserve the learning capacity for the positive part. Furthermore, we examine the impact of different temperature coefficient adjustments, which can be applied to further balance knowledge transfer. We evaluate the proposed method on several computer vision tasks, including image classification and semantic segmentation. The evaluation results show that our method achieves an accuracy improvement of 1%~3% for lightweight students on both the CIFAR-100 and ImageNet datasets, and a 4.55% improvement in mIoU for PSP-ResNet18 on the Cityscapes dataset. The experiments show that our method is a simple yet highly effective solution that can be smoothly applied to different knowledge distillation methods.
+
+
+
+
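+ A hedged sketch of combining forward and reverse Kullback-Leibler terms for logit distillation, in the spirit of the balance described above; the weighting and temperature are placeholder hyperparameters, not the paper's settings.
```python
import torch
import torch.nn.functional as F

def balanced_kd_loss(student_logits, teacher_logits, T: float = 4.0, alpha: float = 0.5):
    """Weighted sum of forward KL(teacher || student) and reverse
    KL(student || teacher) on temperature-softened distributions."""
    p_t = F.softmax(teacher_logits / T, dim=-1)
    p_s = F.softmax(student_logits / T, dim=-1)
    log_p_t = F.log_softmax(teacher_logits / T, dim=-1)
    log_p_s = F.log_softmax(student_logits / T, dim=-1)
    forward_kl = F.kl_div(log_p_s, p_t, reduction="batchmean")   # KL(teacher || student)
    reverse_kl = F.kl_div(log_p_t, p_s, reduction="batchmean")   # KL(student || teacher)
    return (alpha * forward_kl + (1 - alpha) * reverse_kl) * T * T

s, t = torch.randn(16, 100, requires_grad=True), torch.randn(16, 100)
loss = balanced_kd_loss(s, t)
loss.backward()
print(float(loss))
```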
+ + ☆ BioPose: Biomechanically-accurate 3D Pose Estimation from Monocular + Videos + + +
+ Recent advancements in 3D human pose estimation from single-camera images and +videos have relied on parametric models, like SMPL. However, these models +oversimplify anatomical structures, limiting their accuracy in capturing true +joint locations and movements, which reduces their applicability in +biomechanics, healthcare, and robotics. Biomechanically accurate pose +estimation, on the other hand, typically requires costly marker-based motion +capture systems and optimization techniques in specialized labs. To bridge this +gap, we propose BioPose, a novel learning-based framework for predicting +biomechanically accurate 3D human pose directly from monocular videos. BioPose +includes three key components: a Multi-Query Human Mesh Recovery model +(MQ-HMR), a Neural Inverse Kinematics (NeurIK) model, and a 2D-informed pose +refinement technique. MQ-HMR leverages a multi-query deformable transformer to +extract multi-scale fine-grained image features, enabling precise human mesh +recovery. NeurIK treats the mesh vertices as virtual markers, applying a +spatial-temporal network to regress biomechanically accurate 3D poses under +anatomical constraints. To further improve 3D pose estimations, a 2D-informed +refinement step optimizes the query tokens during inference by aligning the 3D +structure with 2D pose observations. Experiments on benchmark datasets +demonstrate that BioPose significantly outperforms state-of-the-art methods. +Project website: +\url{https://m-usamasaleem.github.io/publication/BioPose/BioPose.html}. + +
+
+
+
+
+ + ☆ Parameter-Inverted Image Pyramid Networks for Visual Perception and + Multimodal Understanding + + +
+ Image pyramids are widely adopted in top-performing methods to obtain +multi-scale features for precise visual perception and understanding. However, +current image pyramids use the same large-scale model to process multiple +resolutions of images, leading to significant computational cost. To address +this challenge, we propose a novel network architecture, called +Parameter-Inverted Image Pyramid Networks (PIIP). Specifically, PIIP uses +pretrained models (ViTs or CNNs) as branches to process multi-scale images, +where images of higher resolutions are processed by smaller network branches to +balance computational cost and performance. To integrate information from +different spatial scales, we further propose a novel cross-branch feature +interaction mechanism. To validate PIIP, we apply it to various perception +models and a representative multimodal large language model called LLaVA, and +conduct extensive experiments on various tasks such as object detection, +segmentation, image classification and multimodal understanding. PIIP achieves +superior performance compared to single-branch and existing multi-resolution +approaches with lower computational cost. When applied to InternViT-6B, a +large-scale vision foundation model, PIIP can improve its performance by 1%-2% +on detection and segmentation with only 40%-60% of the original computation, +finally achieving 60.0 box AP on MS COCO and 59.7 mIoU on ADE20K. For +multimodal understanding, our PIIP-LLaVA achieves 73.0% accuracy on TextVQA and +74.5% on MMBench with only 2.8M training data. Our code is released at +https://github.com/OpenGVLab/PIIP. + +
+
+
+
+
+ + ☆ BMIP: Bi-directional Modality Interaction Prompt Learning for VLM + + +
+ Vision-language models (VLMs) have exhibited remarkable generalization +capabilities, and prompt learning for VLMs has attracted great attention for +the ability to adapt pre-trained VLMs to specific downstream tasks. However, +existing studies mainly focus on single-modal prompts or uni-directional +modality interaction, overlooking the powerful alignment effects resulting from +the interaction between the vision and language modalities. To this end, we +propose a novel prompt learning method called +$\underline{\textbf{B}}i-directional \underline{\textbf{M}}odality +\underline{\textbf{I}}nteraction \underline{\textbf{P}}rompt (BMIP)$, which +dynamically weights bi-modal information through learning the information of +the attention layer, enhancing trainability and inter-modal consistency +compared to simple information aggregation methods. To evaluate the +effectiveness of prompt learning methods, we propose a more realistic +evaluation paradigm called open-world generalization complementing the widely +adopted cross-dataset transfer and domain generalization tasks. Comprehensive +experiments on various datasets reveal that BMIP not only outperforms current +state-of-the-art methods across all three evaluation paradigms but is also +flexible enough to be combined with other prompt-based methods for consistent +performance enhancement. + +
+
+
+
+
+ + ☆ PSReg: Prior-guided Sparse Mixture of Experts for Point Cloud + Registration + + +
+ Discriminative features are crucial for point cloud registration. Recent methods improve feature discriminability by distinguishing between points in non-overlapping and overlapping regions. However, they still struggle to distinguish ambiguous structures within the overlapping regions, so the ambiguous features they extract produce a significant number of outlier matches from those regions. To solve this problem, we propose a prior-guided SMoE-based registration method that improves feature distinctiveness by dispatching potential correspondences to the same experts. Specifically, we propose a prior-guided SMoE module that fuses prior overlap and potential correspondence embeddings for routing, assigning tokens to the most suitable experts for processing. In addition, we propose a registration framework built from a specific combination of Transformer layers and prior-guided SMoE modules. The proposed method not only focuses on locating the overlapping areas of the point clouds but also on finding more accurate correspondences within those areas. Extensive experiments demonstrate the effectiveness of our method, which achieves state-of-the-art registration recall (95.7\%/79.3\%) on the 3DMatch/3DLoMatch benchmarks. Experiments on ModelNet40 likewise show excellent performance. +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ Automotive Elevation Mapping with Interferometric Synthetic Aperture + Radar + + +
+ Radar is a low-cost and ubiquitous automotive sensor, but it is limited by array resolution and sensitivity when performing direction-of-arrival analysis. Synthetic Aperture Radar (SAR) is a class of techniques for improving the azimuth resolution and sensitivity of radar. Interferometric SAR (InSAR) can be used to extract elevation from the variations in phase measurements across SAR images. Utilizing InSAR, we show that a typical low-resolution radar array mounted on a vehicle can accurately localize detections in 3D space in both urban and agricultural environments. We generate point clouds in each environment by combining InSAR with a signal processing scheme tailored to automotive driving. This low-compute approach allows radar to be used as a primary sensor to map fine details in complex driving environments and to inform autonomous perception decisions. +
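For readers unfamiliar with how InSAR turns phase into elevation, the snippet below applies the textbook single-baseline height approximation h ≈ λ R sin(θ) Δφ / (4π B⊥). This is only a back-of-the-envelope sketch under standard repeat-pass conventions (a 2π factor applies for some single-pass geometries); it does not reproduce the paper's automotive processing chain, and the example numbers are hypothetical.

```python
# Textbook single-baseline InSAR height approximation (illustrative only).
# Assumes flattened interferometric phase and a two-way propagation path.
import math

def insar_height(delta_phi: float, wavelength: float, slant_range: float,
                 incidence_angle: float, baseline_perp: float) -> float:
    """h ~= lambda * R * sin(theta) * delta_phi / (4 * pi * B_perp)."""
    return (wavelength * slant_range * math.sin(incidence_angle) * delta_phi
            / (4.0 * math.pi * baseline_perp))

# Hypothetical 77 GHz automotive radar numbers.
wavelength = 3e8 / 77e9  # ~3.9 mm
print(insar_height(delta_phi=0.5, wavelength=wavelength,
                   slant_range=20.0, incidence_angle=math.radians(80),
                   baseline_perp=0.02))
```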
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ FLAVARS: A Multimodal Foundational Language and Vision Alignment Model + for Remote Sensing + + +
+ Remote sensing imagery is dense with objects and contextual visual +information. There is a recent trend to combine paired satellite images and +text captions for pretraining performant encoders for downstream tasks. +However, while contrastive image-text methods like CLIP enable vision-language +alignment and zero-shot classification ability, vision-only downstream +performance tends to degrade compared to image-only pretraining, such as MAE. +In this paper, we propose FLAVARS, a pretraining method that combines the best +of both contrastive learning and masked modeling, along with geospatial +alignment via contrastive location encoding. We find that FLAVARS significantly +outperforms a baseline of SkyCLIP for vision-only tasks such as KNN +classification and semantic segmentation, +6\% mIOU on SpaceNet1, while +retaining the ability to perform zero-shot classification, unlike MAE +pretrained methods. + +
+
+
+
+
+ + ☆ Benchmarking Classical, Deep, and Generative Models for Human Activity + Recognition + + +
+ Human Activity Recognition (HAR) has gained significant importance with the growing use of sensor-equipped devices and large datasets. This paper evaluates the performance of three categories of models: classical machine learning, deep learning architectures, and Restricted Boltzmann Machines (RBMs), using five key HAR benchmark datasets (UCI-HAR, OPPORTUNITY, PAMAP2, WISDM, and Berkeley MHAD). We assess various models, including Decision Trees, Random Forests, Convolutional Neural Networks (CNNs), and Deep Belief Networks (DBNs), using metrics such as accuracy, precision, recall, and F1-score for a comprehensive comparison. The results show that CNN models offer superior performance across all datasets, especially on Berkeley MHAD. Classical models like Random Forest do well on smaller datasets but face challenges with larger, more complex data. RBM-based models also show notable potential, particularly for feature learning. This paper offers a detailed comparison to help researchers choose the most suitable model for HAR tasks. +
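The comparison metrics named above can be reproduced in a few lines; the snippet below uses scikit-learn on dummy labels rather than any of the listed HAR datasets.

```python
# Illustrative computation of the comparison metrics (accuracy, precision,
# recall, F1); the labels here are dummy data, not a HAR dataset.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = [0, 1, 2, 1, 0, 2, 1]
y_pred = [0, 1, 1, 1, 0, 2, 2]

acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="macro", zero_division=0)
print(f"accuracy={acc:.3f} precision={prec:.3f} recall={rec:.3f} f1={f1:.3f}")
```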
+
+ comment: 48 pages, 21 Figures +
+
+
+
+
+ + ☆ Detecting Contextual Anomalies by Discovering Consistent Spatial Regions + + +
+ We describe a method for modeling spatial context to enable video anomaly +detection. The main idea is to discover regions that share similar object-level +activities by clustering joint object attributes using Gaussian mixture models. +We demonstrate that this straightforward approach, using orders of magnitude +fewer parameters than competing models, achieves state-of-the-art performance +in the challenging spatial-context-dependent Street Scene dataset. As a side +benefit, the high-resolution discovered regions learned by the model also +provide explainable normalcy maps for human operators without the need for any +pre-trained segmentation model. + +
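A minimal sketch of the clustering step described above, assuming the object-level attributes are simple box statistics plus a class id; the exact attribute set and component count used in the paper may differ.

```python
# Sketch: discover consistent spatial regions by clustering object-level
# attributes with a Gaussian mixture (attributes and component count are
# illustrative assumptions; data below is random).
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# Each row: (x_center, y_center, width, height, object_class_id).
attributes = np.column_stack([
    rng.uniform(0, 1920, 500),   # x
    rng.uniform(0, 1080, 500),   # y
    rng.uniform(20, 200, 500),   # w
    rng.uniform(20, 200, 500),   # h
    rng.integers(0, 3, 500),     # class id
]).astype(float)

gmm = GaussianMixture(n_components=8, covariance_type="full", random_state=0)
region_ids = gmm.fit_predict(attributes)
# At test time, low log-likelihood under the fitted mixture can flag a
# contextually anomalous observation for its region.
scores = gmm.score_samples(attributes)
print(region_ids[:10], scores[:3])
```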
+
+
+
+
+ + ☆ Predicting Performance of Object Detection Models in Electron Microscopy + Using Random Forests + + +
+ Quantifying prediction uncertainty when applying object detection models to +new, unlabeled datasets is critical in applied machine learning. This study +introduces an approach to estimate the performance of deep learning-based +object detection models for quantifying defects in transmission electron +microscopy (TEM) images, focusing on detecting irradiation-induced cavities in +TEM images of metal alloys. We developed a random forest regression model that +predicts the object detection F1 score, a statistical metric used to evaluate +the ability to accurately locate and classify objects of interest. The random +forest model uses features extracted from the predictions of the object +detection model whose uncertainty is being quantified, enabling fast prediction +on new, unlabeled images. The mean absolute error (MAE) for predicting F1 of +the trained model on test data is 0.09, and the $R^2$ score is 0.77, indicating +there is a significant correlation between the random forest regression model +predicted and true defect detection F1 scores. The approach is shown to be +robust across three distinct TEM image datasets with varying imaging and +material domains. Our approach enables users to estimate the reliability of a +defect detection and segmentation model predictions and assess the +applicability of the model to their specific datasets, providing valuable +information about possible domain shifts and whether the model needs to be +fine-tuned or trained on additional data to be maximally effective for the +desired use case. + +
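A minimal sketch of the regression setup described above; the feature columns and targets are synthetic placeholders rather than the detector-derived TEM features used in the study.

```python
# Sketch: predict an object detector's F1 score with a random forest
# regressor and report MAE / R^2 on held-out data (synthetic placeholders).
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
# Hypothetical per-image features derived from detector outputs, e.g. number
# of detections, mean confidence, confidence spread, mean box area.
X = rng.random((400, 4))
y = np.clip(0.2 + 0.6 * X[:, 1] - 0.2 * X[:, 2]
            + 0.05 * rng.standard_normal(400), 0, 1)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X_tr, y_tr)
pred = model.predict(X_te)
print(f"MAE={mean_absolute_error(y_te, pred):.3f}  R2={r2_score(y_te, pred):.3f}")
```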
+
+ comment: 14 pages, 9 figures, 3 tables +
+
+
+
+
+ + ☆ Towards Zero-Shot & Explainable Video Description by Reasoning over + Graphs of Events in Space and Time + + +
+ In the current era of Machine Learning, Transformers have become the de facto approach across a variety of domains, such as computer vision and natural language processing. Transformer-based solutions are the backbone of current state-of-the-art methods for language generation, image and video classification, segmentation, and action and object recognition, among many others. Interestingly, while these state-of-the-art methods produce impressive results in their respective domains, the problem of understanding the relationship between vision and language is still beyond our reach. In this work, we propose a common ground between vision and language based on events in space and time, formulated in an explainable and programmatic way, to connect state-of-the-art learning-based vision and language models and to provide a solution to the long-standing problem of describing videos in natural language. We validate that our algorithmic approach is able to generate coherent, rich, and relevant textual descriptions of videos collected from a variety of datasets, using both standard metrics (e.g., BLEU, ROUGE) and the modern LLM-as-a-Jury approach. +
+
+
+
+
+ + ☆ RWKV-UNet: Improving UNet with Long-Range Cooperation for Effective + Medical Image Segmentation + + +
+ In recent years, there have been significant advancements in deep learning for medical image analysis, especially with convolutional neural networks (CNNs) and transformer models. However, CNNs face limitations in capturing long-range dependencies, while transformers suffer from high computational complexity. To address this, we propose RWKV-UNet, a novel model that integrates the RWKV (Receptance Weighted Key Value) structure into the U-Net architecture. This integration enhances the model's ability to capture long-range dependencies and improves contextual understanding, which is crucial for accurate medical image segmentation. We build a strong encoder with newly developed inverted residual RWKV (IR-RWKV) blocks that combine CNNs and RWKVs. We also propose a Cross-Channel Mix (CCM) module to improve skip connections with multi-scale feature fusion, achieving global channel information integration. Experiments on benchmark datasets, including Synapse, ACDC, BUSI, CVC-ClinicDB, CVC-ColonDB, Kvasir-SEG, ISIC 2017, and GLAS, show that RWKV-UNet achieves state-of-the-art performance on various types of medical image segmentation. Additionally, smaller variants, RWKV-UNet-S and RWKV-UNet-T, balance accuracy and computational efficiency, making them suitable for broader clinical applications. +
+
+
+
+
+ + ☆ Vchitect-2.0: Parallel Transformer for Scaling Up Video Diffusion Models + + +
+ We present Vchitect-2.0, a parallel transformer architecture designed to +scale up video diffusion models for large-scale text-to-video generation. The +overall Vchitect-2.0 system has several key designs. (1) By introducing a novel +Multimodal Diffusion Block, our approach achieves consistent alignment between +text descriptions and generated video frames, while maintaining temporal +coherence across sequences. (2) To overcome memory and computational +bottlenecks, we propose a Memory-efficient Training framework that incorporates +hybrid parallelism and other memory reduction techniques, enabling efficient +training of long video sequences on distributed systems. (3) Additionally, our +enhanced data processing pipeline ensures the creation of Vchitect T2V +DataVerse, a high-quality million-scale training dataset through rigorous +annotation and aesthetic evaluation. Extensive benchmarking demonstrates that +Vchitect-2.0 outperforms existing methods in video quality, training +efficiency, and scalability, serving as a suitable base for high-fidelity video +generation. + +
+
+
+
+
+ + ☆ Poseidon: A ViT-based Architecture for Multi-Frame Pose Estimation with + Adaptive Frame Weighting and Multi-Scale Feature Fusion + + +
+ Human pose estimation, a vital task in computer vision, involves detecting +and localising human joints in images and videos. While single-frame pose +estimation has seen significant progress, it often fails to capture the +temporal dynamics for understanding complex, continuous movements. We propose +Poseidon, a novel multi-frame pose estimation architecture that extends the +ViTPose model by integrating temporal information for enhanced accuracy and +robustness to address these limitations. Poseidon introduces key innovations: +(1) an Adaptive Frame Weighting (AFW) mechanism that dynamically prioritises +frames based on their relevance, ensuring that the model focuses on the most +informative data; (2) a Multi-Scale Feature Fusion (MSFF) module that +aggregates features from different backbone layers to capture both fine-grained +details and high-level semantics; and (3) a Cross-Attention module for +effective information exchange between central and contextual frames, enhancing +the model's temporal coherence. The proposed architecture improves performance +in complex video scenarios and offers scalability and computational efficiency +suitable for real-world applications. Our approach achieves state-of-the-art +performance on the PoseTrack21 and PoseTrack18 datasets, achieving mAP scores +of 88.3 and 87.8, respectively, outperforming existing methods. + +
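The Adaptive Frame Weighting idea can be illustrated with a tiny module that scores each frame's features and forms a softmax-weighted sum; the scoring head and feature shapes below are assumptions for illustration, not the Poseidon implementation.

```python
# Minimal sketch of adaptive frame weighting: score each frame's features
# and take a softmax-weighted sum over frames (shapes are illustrative).
import torch
import torch.nn as nn

class AdaptiveFrameWeighting(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.scorer = nn.Linear(dim, 1)  # relevance score per frame

    def forward(self, frame_feats: torch.Tensor) -> torch.Tensor:
        # frame_feats: (batch, num_frames, dim)
        scores = self.scorer(frame_feats)          # (B, T, 1)
        weights = torch.softmax(scores, dim=1)     # normalize over frames
        return (weights * frame_feats).sum(dim=1)  # (B, dim)

if __name__ == "__main__":
    afw = AdaptiveFrameWeighting(dim=256)
    fused = afw(torch.randn(2, 5, 256))
    print(fused.shape)  # torch.Size([2, 256])
```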
+
+
+
+
+ + ☆ FARE: A Deep Learning-Based Framework for Radar-based Face Recognition + and Out-of-distribution Detection + + +
+ In this work, we propose a novel pipeline for face recognition and out-of-distribution (OOD) detection using short-range FMCW radar. The proposed system utilizes Range-Doppler and micro Range-Doppler images. The architecture features a primary path (PP) responsible for the classification of in-distribution (ID) faces, complemented by intermediate paths (IPs) dedicated to OOD detection. The network is trained in two stages: first, the PP is trained using triplet loss to optimize ID face classification. In the second stage, the PP is frozen, and the IPs, comprising simple linear autoencoder networks, are trained specifically for OOD detection. Using our dataset generated with a 60 GHz FMCW radar, our method achieves an ID classification accuracy of 99.30% and an OOD detection AUROC of 96.91%. +
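A sketch of the second-stage idea, assuming OOD-ness is scored by the reconstruction error of a small linear autoencoder applied to frozen intermediate features; the dimensions and any decision threshold are illustrative, not the paper's configuration.

```python
# Sketch: score out-of-distribution inputs by the reconstruction error of a
# small linear autoencoder on frozen intermediate features (sizes are
# illustrative assumptions).
import torch
import torch.nn as nn

class LinearAutoencoder(nn.Module):
    def __init__(self, dim: int, bottleneck: int = 32):
        super().__init__()
        self.enc = nn.Linear(dim, bottleneck)
        self.dec = nn.Linear(bottleneck, dim)

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        return self.dec(self.enc(feats))

def ood_score(ae: LinearAutoencoder, feats: torch.Tensor) -> torch.Tensor:
    # Higher reconstruction error -> more likely out-of-distribution.
    with torch.no_grad():
        recon = ae(feats)
    return ((recon - feats) ** 2).mean(dim=-1)

ae = LinearAutoencoder(dim=128)
print(ood_score(ae, torch.randn(4, 128)))
```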
+
+ comment: Accepted at ICASSP 2025 +
+
+
+
+
+ + ☆ Cross-Modal Transferable Image-to-Video Attack on Video Quality Metrics + + +
+ Recent studies have revealed that modern image and video quality assessment +(IQA/VQA) metrics are vulnerable to adversarial attacks. An attacker can +manipulate a video through preprocessing to artificially increase its quality +score according to a certain metric, despite no actual improvement in visual +quality. Most of the attacks studied in the literature are white-box attacks, +while black-box attacks in the context of VQA have received less attention. +Moreover, some research indicates a lack of transferability of adversarial +examples generated for one model to another when applied to VQA. In this paper, +we propose a cross-modal attack method, IC2VQA, aimed at exploring the +vulnerabilities of modern VQA models. This approach is motivated by the +observation that the low-level feature spaces of images and videos are similar. +We investigate the transferability of adversarial perturbations across +different modalities; specifically, we analyze how adversarial perturbations +generated on a white-box IQA model with an additional CLIP module can +effectively target a VQA model. The addition of the CLIP module serves as a +valuable aid in increasing transferability, as the CLIP model is known for its +effective capture of low-level semantics. Extensive experiments demonstrate +that IC2VQA achieves a high success rate in attacking three black-box VQA +models. We compare our method with existing black-box attack strategies, +highlighting its superiority in terms of attack success within the same number +of iterations and levels of attack strength. We believe that the proposed +method will contribute to the deeper analysis of robust VQA metrics. + +
+
+ comment: Accepted for VISAPP 2025 +
+
+
+
+
+ + ☆ BiDepth Multimodal Neural Network: Bidirectional Depth Deep Learning Architecture for Spatial-Temporal Prediction + + +
+ Accurate prediction of spatial-temporal (ST) information in dynamic systems, +such as urban mobility and weather patterns, is a crucial yet challenging +problem. The complexity stems from the intricate interplay between spatial +proximity and temporal relevance, where both long-term trends and short-term +fluctuations are present in convoluted patterns. Existing approaches, including +traditional statistical methods and conventional neural networks, may provide +inaccurate results due to the lack of an effective mechanism that +simultaneously incorporates information at variable temporal depths while +maintaining spatial context, resulting in a trade-off between comprehensive +long-term historical analysis and responsiveness to short-term new information. +To bridge this gap, this paper proposes the BiDepth Multimodal Neural Network +(BDMNN) with bidirectional depth modulation that enables a comprehensive +understanding of both long-term seasonality and short-term fluctuations, +adapting to the complex ST context. Case studies with real-world public data +demonstrate significant improvements in prediction accuracy, with a 12% +reduction in Mean Squared Error for urban traffic prediction and a 15% +improvement in rain precipitation forecasting compared to state-of-the-art +benchmarks, without demanding extra computational resources. + +
+
+ comment: This paper has been submitted to Applied Intelligence for review +
+
+
+
+
+ + ☆ Leveraging 2D Masked Reconstruction for Domain Adaptation of 3D Pose + Estimation + + +
+ RGB-based 3D pose estimation methods have been successful with the development of deep learning and the emergence of high-quality 3D pose datasets. However, most existing methods do not operate well on test images whose distribution is far from that of the training data. This problem might be alleviated by involving diverse data during training; however, it is non-trivial to collect such diverse data with corresponding labels (i.e., 3D poses). In this paper, we introduce an unsupervised domain adaptation framework for 3D pose estimation that utilizes unlabeled data in addition to labeled data via a masked image modeling (MIM) framework. Foreground-centric reconstruction and attention regularization are further proposed to increase the effectiveness of unlabeled data usage. Experiments are conducted on various datasets for human and hand pose estimation tasks, with an emphasis on the cross-domain scenario. We demonstrate the effectiveness of our method by achieving state-of-the-art accuracy on all datasets. +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ 3D Gaussian Splatting with Normal Information for Mesh Extraction and + Improved Rendering + + +
+ Differentiable 3D Gaussian splatting has emerged as an efficient and flexible +rendering technique for representing complex scenes from a collection of 2D +views and enabling high-quality real-time novel-view synthesis. However, its +reliance on photometric losses can lead to imprecisely reconstructed geometry +and extracted meshes, especially in regions with high curvature or fine detail. +We propose a novel regularization method using the gradients of a signed +distance function estimated from the Gaussians, to improve the quality of +rendering while also extracting a surface mesh. The regularizing normal +supervision facilitates better rendering and mesh reconstruction, which is +crucial for downstream applications in video generation, animation, AR-VR and +gaming. We demonstrate the effectiveness of our approach on datasets such as +Mip-NeRF360, Tanks and Temples, and Deep-Blending. Our method scores higher on +photorealism metrics compared to other mesh extracting rendering methods +without compromising mesh quality. + +
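One plausible form of the described regularizer is a cosine-style penalty between per-Gaussian normals and normalized SDF gradients; the sketch below uses random placeholders for both quantities and is not the paper's exact loss.

```python
# Sketch of a normal-consistency regularizer: penalize misalignment between
# per-Gaussian normals and the normalized gradient of an estimated signed
# distance function (inputs here are random placeholders).
import torch
import torch.nn.functional as F

def normal_regularization(gaussian_normals: torch.Tensor,
                          sdf_gradients: torch.Tensor) -> torch.Tensor:
    # gaussian_normals, sdf_gradients: (N, 3)
    n = F.normalize(gaussian_normals, dim=-1)
    g = F.normalize(sdf_gradients, dim=-1)
    # 1 - |cos| keeps the loss agnostic to normal orientation (sign).
    return (1.0 - (n * g).sum(dim=-1).abs()).mean()

loss = normal_regularization(torch.randn(1024, 3), torch.randn(1024, 3))
print(loss.item())
```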
+
+ comment: ICASSP 2025: Workshop on Generative Data Augmentation for Real-World + Signal Processing Applications +
+
+
+
+
+ + ♻ ☆ Rate-In: Information-Driven Adaptive Dropout Rates for Improved + Inference-Time Uncertainty Estimation + + +
+ Accurate uncertainty estimation is crucial for deploying neural networks in +risk-sensitive applications such as medical diagnosis. Monte Carlo Dropout is a +widely used technique for approximating predictive uncertainty by performing +stochastic forward passes with dropout during inference. However, using static +dropout rates across all layers and inputs can lead to suboptimal uncertainty +estimates, as it fails to adapt to the varying characteristics of individual +inputs and network layers. Existing approaches optimize dropout rates during +training using labeled data, resulting in fixed inference-time parameters that +cannot adjust to new data distributions, compromising uncertainty estimates in +Monte Carlo simulations. + In this paper, we propose Rate-In, an algorithm that dynamically adjusts +dropout rates during inference by quantifying the information loss induced by +dropout in each layer's feature maps. By treating dropout as controlled noise +injection and leveraging information-theoretic principles, Rate-In adapts +dropout rates per layer and per input instance without requiring ground truth +labels. By quantifying the functional information loss in feature maps, we +adaptively tune dropout rates to maintain perceptual quality across diverse +medical imaging tasks and architectural configurations. Our extensive empirical +study on synthetic data and real-world medical imaging tasks demonstrates that +Rate-In improves calibration and sharpens uncertainty estimates compared to +fixed or heuristic dropout rates without compromising predictive performance. +Rate-In offers a practical, unsupervised, inference-time approach to optimizing +dropout for more reliable predictive uncertainty estimation in critical +applications. + +
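To make the idea concrete, the sketch below lowers a layer's dropout rate at inference time until a crude proxy for information loss (the relative change of the feature map) falls under a tolerance. Both the proxy and the update rule are assumptions chosen for illustration; Rate-In relies on a more principled information-theoretic estimator.

```python
# Illustrative inference-time dropout-rate adaptation: decrease the rate
# until a simple proxy for information loss is below a tolerance.
import torch
import torch.nn.functional as F

def adapt_dropout_rate(feature_map: torch.Tensor, init_rate: float = 0.5,
                       tolerance: float = 0.2, step: float = 0.05) -> float:
    rate = init_rate
    while rate > step:
        dropped = F.dropout(feature_map, p=rate, training=True)
        rel_change = (dropped - feature_map).norm() / feature_map.norm()
        if rel_change <= tolerance:
            break
        rate -= step
    return max(rate, step)

feats = torch.randn(1, 64, 16, 16)
print(adapt_dropout_rate(feats))
```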
+
+ comment: Updated author affiliation +
+
+
+
+
+ + ♻ ☆ Gaussian Eigen Models for Human Heads + + +
+ Current personalized neural head avatars face a trade-off: lightweight models +lack detail and realism, while high-quality, animatable avatars require +significant computational resources, making them unsuitable for commodity +devices. To address this gap, we introduce Gaussian Eigen Models (GEM), which +provide high-quality, lightweight, and easily controllable head avatars. GEM +utilizes 3D Gaussian primitives for representing the appearance combined with +Gaussian splatting for rendering. Building on the success of mesh-based 3D +morphable face models (3DMM), we define GEM as an ensemble of linear eigenbases +for representing the head appearance of a specific subject. In particular, we +construct linear bases to represent the position, scale, rotation, and opacity +of the 3D Gaussians. This allows us to efficiently generate Gaussian primitives +of a specific head shape by a linear combination of the basis vectors, only +requiring a low-dimensional parameter vector that contains the respective +coefficients. We propose to construct these linear bases (GEM) by distilling +high-quality compute-intense CNN-based Gaussian avatar models that can generate +expression-dependent appearance changes like wrinkles. These high-quality +models are trained on multi-view videos of a subject and are distilled using a +series of principal component analyses. Once we have obtained the bases that +represent the animatable appearance space of a specific human, we learn a +regressor that takes a single RGB image as input and predicts the +low-dimensional parameter vector that corresponds to the shown facial +expression. In a series of experiments, we compare GEM's self-reenactment and +cross-person reenactment results to state-of-the-art 3D avatar methods, +demonstrating GEM's higher visual quality and better generalization to new +expressions. + +
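The distillation-by-PCA step can be pictured as follows: flatten per-frame Gaussian parameters, fit linear bases, and reconstruct new configurations from a low-dimensional coefficient vector. Shapes and data here are synthetic placeholders, not the paper's avatar models.

```python
# Sketch: distill per-frame Gaussian parameters into linear eigenbases with
# PCA, then reconstruct a configuration from a low-dimensional code.
import numpy as np
from sklearn.decomposition import PCA

num_frames, num_gaussians, params_per_gaussian = 200, 2000, 11  # pos+scale+rot+opacity
rng = np.random.default_rng(0)
samples = rng.standard_normal((num_frames, num_gaussians * params_per_gaussian))

pca = PCA(n_components=16).fit(samples)        # the linear "eigen model" bases
coeffs = pca.transform(samples[:1])            # low-dimensional code for one frame
reconstructed = pca.inverse_transform(coeffs)  # linear combination of the bases
print(coeffs.shape, reconstructed.shape)
```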
+
+ comment: https://zielon.github.io/gem/ +
+
+
+
+
+ + ♻ ☆ A Multi-Modal Approach for Face Anti-Spoofing in Non-Calibrated Systems + using Disparity Maps + + +
+ Face recognition technologies are increasingly used in various applications, yet they are vulnerable to face spoofing attacks. These spoofing attacks often involve unique 3D structures, such as printed papers or mobile device screens. Although stereo-depth cameras can detect such attacks effectively, their high cost limits their widespread adoption. Conversely, two-sensor systems without extrinsic calibration offer a cost-effective alternative but are unable to calculate depth using stereo techniques. In this work, we propose a method to overcome this challenge by leveraging facial attributes to derive disparity information and estimate relative depth for anti-spoofing purposes, using non-calibrated systems. We introduce a multi-modal anti-spoofing model, coined Disparity Model, that incorporates the created disparity maps as a third modality alongside the two original sensor modalities. We demonstrate the effectiveness of the Disparity Model in countering various spoof attacks using a comprehensive dataset collected from the Intel RealSense ID Solution F455. Our method outperformed existing methods in the literature, achieving an Equal Error Rate (EER) of 1.71% and a False Negative Rate (FNR) of 2.77% at a False Positive Rate (FPR) of 1%. These errors are lower by 2.45% and 7.94%, respectively, than those of the best comparison method. Additionally, we introduce a model ensemble that also addresses 3D spoof attacks, achieving an EER of 2.04% and an FNR of 3.83% at an FPR of 1%. Overall, our work provides a state-of-the-art solution for the challenging task of anti-spoofing in non-calibrated systems that lack depth information. +
+
+
+
+
+ + ♻ ☆ RMem: Restricted Memory Banks Improve Video Object Segmentation CVPR 2024 + + +
+ With recent video object segmentation (VOS) benchmarks evolving to +challenging scenarios, we revisit a simple but overlooked strategy: restricting +the size of memory banks. This diverges from the prevalent practice of +expanding memory banks to accommodate extensive historical information. Our +specially designed "memory deciphering" study offers a pivotal insight +underpinning such a strategy: expanding memory banks, while seemingly +beneficial, actually increases the difficulty for VOS modules to decode +relevant features due to the confusion from redundant information. By +restricting memory banks to a limited number of essential frames, we achieve a +notable improvement in VOS accuracy. This process balances the importance and +freshness of frames to maintain an informative memory bank within a bounded +capacity. Additionally, restricted memory banks reduce the training-inference +discrepancy in memory lengths compared with continuous expansion. This fosters +new opportunities in temporal reasoning and enables us to introduce the +previously overlooked "temporal positional embedding." Finally, our insights +are embodied in "RMem" ("R" for restricted), a simple yet effective VOS +modification that excels at challenging VOS scenarios and establishes new state +of the art for object state changes (on the VOST dataset) and long videos (on +the Long Videos dataset). Our code and demo are available at +https://restricted-memory.github.io/. + +
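A toy sketch of a restricted memory bank that keeps the first (reference) frame and evicts the least useful of the remaining frames once a fixed capacity is exceeded; the score mixing importance and freshness is an illustrative assumption, not the RMem policy.

```python
# Toy restricted memory bank: bounded capacity, keep the reference frame,
# evict the lowest-scoring stored frame (scoring rule is illustrative).
from dataclasses import dataclass, field

@dataclass
class RestrictedMemoryBank:
    capacity: int = 5
    frames: list = field(default_factory=list)  # (frame_idx, importance)

    def insert(self, frame_idx: int, importance: float) -> None:
        self.frames.append((frame_idx, importance))
        if len(self.frames) > self.capacity:
            # Always keep the first (reference) frame; evict the stored frame
            # with the lowest combined importance + freshness score.
            candidates = self.frames[1:]
            evict = min(candidates, key=lambda f: f[1] + 0.01 * f[0])
            self.frames.remove(evict)

bank = RestrictedMemoryBank(capacity=4)
for t in range(10):
    bank.insert(t, importance=1.0 if t == 0 else 0.5)
print(bank.frames)
```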
+
+ comment: CVPR 2024, Project Page: https://restricted-memory.github.io/ +
+
+
+
+
+ + ♻ ☆ FaVoR: Features via Voxel Rendering for Camera Relocalization + + +
+ Camera relocalization methods range from dense image alignment to direct +camera pose regression from a query image. Among these, sparse feature matching +stands out as an efficient, versatile, and generally lightweight approach with +numerous applications. However, feature-based methods often struggle with +significant viewpoint and appearance changes, leading to matching failures and +inaccurate pose estimates. To overcome this limitation, we propose a novel +approach that leverages a globally sparse yet locally dense 3D representation +of 2D features. By tracking and triangulating landmarks over a sequence of +frames, we construct a sparse voxel map optimized to render image patch +descriptors observed during tracking. Given an initial pose estimate, we first +synthesize descriptors from the voxels using volumetric rendering and then +perform feature matching to estimate the camera pose. This methodology enables +the generation of descriptors for unseen views, enhancing robustness to view +changes. We extensively evaluate our method on the 7-Scenes and Cambridge +Landmarks datasets. Our results show that our method significantly outperforms +existing state-of-the-art feature representation techniques in indoor +environments, achieving up to a 39% improvement in median translation error. +Additionally, our approach yields comparable results to other methods for +outdoor scenarios while maintaining lower memory and computational costs. + +
+
+ comment: Accepted to the IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV), Tucson, Arizona, US, Feb 28-Mar 4, 2025 +
+
+
+
+
+ + ♻ ☆ Vid2Sim: Realistic and Interactive Simulation from Video for Urban + Navigation + + +
+ Sim-to-real gap has long posed a significant challenge for robot learning in +simulation, preventing the deployment of learned models in the real world. +Previous work has primarily focused on domain randomization and system +identification to mitigate this gap. However, these methods are often limited +by the inherent constraints of the simulation and graphics engines. In this +work, we propose Vid2Sim, a novel framework that effectively bridges the +sim2real gap through a scalable and cost-efficient real2sim pipeline for neural +3D scene reconstruction and simulation. Given a monocular video as input, +Vid2Sim can generate photorealistic and physically interactable 3D simulation +environments to enable the reinforcement learning of visual navigation agents +in complex urban environments. Extensive experiments demonstrate that Vid2Sim +significantly improves the performance of urban navigation in the digital twins +and real world by 31.2% and 68.3% in success rate compared with agents trained +with prior simulation methods. + +
+
+ comment: Project page: https://metadriverse.github.io/vid2sim/ +
+
+
+
+
+ + ♻ ☆ Deep Compression Autoencoder for Efficient High-Resolution Diffusion + Models + + +
+ We present Deep Compression Autoencoder (DC-AE), a new family of autoencoder +models for accelerating high-resolution diffusion models. Existing autoencoder +models have demonstrated impressive results at a moderate spatial compression +ratio (e.g., 8x), but fail to maintain satisfactory reconstruction accuracy for +high spatial compression ratios (e.g., 64x). We address this challenge by +introducing two key techniques: (1) Residual Autoencoding, where we design our +models to learn residuals based on the space-to-channel transformed features to +alleviate the optimization difficulty of high spatial-compression autoencoders; +(2) Decoupled High-Resolution Adaptation, an efficient decoupled three-phases +training strategy for mitigating the generalization penalty of high +spatial-compression autoencoders. With these designs, we improve the +autoencoder's spatial compression ratio up to 128 while maintaining the +reconstruction quality. Applying our DC-AE to latent diffusion models, we +achieve significant speedup without accuracy drop. For example, on ImageNet +512x512, our DC-AE provides 19.1x inference speedup and 17.9x training speedup +on H100 GPU for UViT-H while achieving a better FID, compared with the widely +used SD-VAE-f8 autoencoder. Our code is available at +https://github.com/mit-han-lab/efficientvit. + +
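The residual-autoencoding idea can be sketched as a downsampling block whose shortcut is a space-to-channel (pixel-unshuffle) transform, so the learned path only has to model a residual. Channel widths and depth below are illustrative, not the DC-AE architecture.

```python
# Sketch of residual autoencoding on space-to-channel features: the block
# learns only a residual on top of a pixel-unshuffle downsample.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualDownBlock(nn.Module):
    def __init__(self, in_ch: int, factor: int = 2):
        super().__init__()
        out_ch = in_ch * factor * factor
        self.body = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, stride=factor, padding=1),
            nn.SiLU(),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
        )
        self.factor = factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shortcut = F.pixel_unshuffle(x, self.factor)  # space-to-channel transform
        return shortcut + self.body(x)                # learn only the residual

block = ResidualDownBlock(in_ch=16)
print(block(torch.randn(1, 16, 64, 64)).shape)  # torch.Size([1, 64, 32, 32])
```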
+
+ comment: Preprint. First two authors contributed equally to this work. Update: + add USiT (UViT+SiT sampler) results +
+
+
+
+
+ + ♻ ☆ Scaling White-Box Transformers for Vision + + +
+ CRATE, a white-box transformer architecture designed to learn compressed and +sparse representations, offers an intriguing alternative to standard vision +transformers (ViTs) due to its inherent mathematical interpretability. Despite +extensive investigations into the scaling behaviors of language and vision +transformers, the scalability of CRATE remains an open question which this +paper aims to address. Specifically, we propose CRATE-$\alpha$, featuring +strategic yet minimal modifications to the sparse coding block in the CRATE +architecture design, and a light training recipe designed to improve the +scalability of CRATE. Through extensive experiments, we demonstrate that +CRATE-$\alpha$ can effectively scale with larger model sizes and datasets. For +example, our CRATE-$\alpha$-B substantially outperforms the prior best CRATE-B +model accuracy on ImageNet classification by 3.7%, achieving an accuracy of +83.2%. Meanwhile, when scaling further, our CRATE-$\alpha$-L obtains an +ImageNet classification accuracy of 85.1%. More notably, these model +performance improvements are achieved while preserving, and potentially even +enhancing the interpretability of learned CRATE models, as we demonstrate +through showing that the learned token representations of increasingly larger +trained CRATE-$\alpha$ models yield increasingly higher-quality unsupervised +object segmentation of images. The project page is +https://rayjryang.github.io/CRATE-alpha/. + +
+
+ comment: project page: https://rayjryang.github.io/CRATE-alpha/ +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Foundation Models in Medicine + + +
+ Foundation models (FMs) are large-scale deep learning models that are +developed using large datasets and self-supervised learning methods. These +models serve as a base for different downstream tasks, including healthcare. +FMs have been adopted with great success across various domains within +healthcare. Existing healthcare-based surveys have not yet included all of +these domains. Therefore, we provide a detailed survey of FMs in healthcare. We +focus on the history, learning strategies, flagship models, applications, and +challenges of FMs. We explore how FMs such as the BERT and GPT families are +reshaping various healthcare domains, including clinical large language models, +medical image analysis, and omics. Furthermore, we provide a detailed taxonomy +of healthcare applications facilitated by FMs, such as clinical NLP, medical +computer vision, graph learning, and other biology-related tasks. Despite the +promising opportunities FMs provide, they also have several associated +challenges, which are explained in detail. We also outline open research issues +and potential lessons learned to provide researchers and practitioners with +insights into the capabilities of FMs in healthcare to advance their deployment +and mitigate associated risks. + +
+
+ comment: Currently under review in IEEE REVIEWS IN BIOMEDICAL ENGINEERING +
+
+
+
+
+ + ♻ ☆ Text-guided Image Restoration and Semantic Enhancement for Text-to-Image + Person Retrieval + + +
+ The goal of Text-to-Image Person Retrieval (TIPR) is to retrieve specific person images according to the given textual descriptions. A primary challenge in this task is bridging the substantial representational gap between the visual and textual modalities. The prevailing methods map texts and images into a unified embedding space for matching, while the intricate semantic correspondences between texts and images are still not effectively constructed. To address this issue, we propose a novel TIPR framework to build fine-grained interactions and alignment between person images and the corresponding texts. Specifically, by fine-tuning the Contrastive Language-Image Pre-training (CLIP) model, a visual-textual dual encoder is first constructed to preliminarily align the image and text features. Secondly, a Text-guided Image Restoration (TIR) auxiliary task is proposed to map abstract textual entities to specific image regions, improving the alignment between local textual and visual embeddings. Additionally, a cross-modal triplet loss is presented to handle hard samples and further enhance the model's discriminability for minor differences. Moreover, a pruning-based text data augmentation approach is proposed to enhance focus on essential elements in descriptions, thereby avoiding excessive model attention to less significant information. The experimental results show that our proposed method outperforms state-of-the-art methods on three popular benchmark datasets, and the code will be made publicly available at https://github.com/Delong-liu-bupt/SEN. +
+
+ comment: The paper was withdrawn due to a dispute among the authors regarding + the content of the article +
+
+
+
+
+ + ♻ ☆ Relaxed Rotational Equivariance via $G$-Biases in Vision + + +
+ Group Equivariant Convolution (GConv) can capture rotational equivariance from original data. It assumes uniform and strict rotational equivariance across all features under the transformations of a specific group. However, the presentation or distribution of real-world data rarely conforms to strict rotational equivariance, a phenomenon commonly referred to as Rotational Symmetry-Breaking (RSB) in the system or dataset, making GConv unable to adapt effectively to it. Motivated by this, we propose a simple but highly effective method to address this problem, which utilizes a set of learnable biases called $G$-Biases under the group order to relax strict group constraints and thereby achieve a Relaxed Rotational Equivariant Convolution (RREConv). To validate the efficiency of RREConv, we conduct extensive ablation experiments on the discrete rotational group $\mathcal{C}_n$. Experiments demonstrate that the proposed RREConv-based methods achieve excellent performance compared to existing GConv-based methods in both classification and 2D object detection tasks on natural image datasets. +
+
+
+
+
+ + ♻ ☆ Feedback-driven object detection and iterative model improvement + + +
+ Automated object detection has become increasingly valuable across diverse applications, yet efficient, high-quality annotation remains a persistent challenge. In this paper, we present the development and evaluation of a platform designed to interactively improve object detection models. The platform allows uploading and annotating images as well as fine-tuning object detection models. Users can then manually review and refine annotations, further creating improved snapshots that are used for automatic object detection on subsequent image uploads, a process we refer to as semi-automatic annotation, which results in a significant gain in annotation efficiency. Whereas iterative refinement of model results to speed up annotation has become common practice, we are the first to quantitatively evaluate its benefits with respect to time, effort, and interaction savings. Our experimental results show clear evidence of a significant time reduction of up to 53% for semi-automatic compared to manual annotation. Importantly, these efficiency gains did not compromise annotation quality, while matching or occasionally even exceeding the accuracy of manual annotations. These findings demonstrate the potential of our lightweight annotation platform for creating high-quality object detection datasets and provide best practices to guide future development of annotation platforms. The platform is open source, with the frontend and backend repositories available on GitHub (https://github.com/ml-lab-htw/iterative-annotate). To support the understanding of our labeling process, we have created an explanatory video demonstrating the methodology using microscopy images of E. coli bacteria as an example. The video is available on YouTube (https://www.youtube.com/watch?v=CM9uhE8NN5E). +
+
+ comment: AI4EA24 +
+
+
+
+
+ + ♻ ☆ ORFormer: Occlusion-Robust Transformer for Accurate Facial Landmark + Detection + + +
+ Although facial landmark detection (FLD) has gained significant progress, +existing FLD methods still suffer from performance drops on partially +non-visible faces, such as faces with occlusions or under extreme lighting +conditions or poses. To address this issue, we introduce ORFormer, a novel +transformer-based method that can detect non-visible regions and recover their +missing features from visible parts. Specifically, ORFormer associates each +image patch token with one additional learnable token called the messenger +token. The messenger token aggregates features from all but its patch. This +way, the consensus between a patch and other patches can be assessed by +referring to the similarity between its regular and messenger embeddings, +enabling non-visible region identification. Our method then recovers occluded +patches with features aggregated by the messenger tokens. Leveraging the +recovered features, ORFormer compiles high-quality heatmaps for the downstream +FLD task. Extensive experiments show that our method generates heatmaps +resilient to partial occlusions. By integrating the resultant heatmaps into +existing FLD methods, our method performs favorably against the state of the +arts on challenging datasets such as WFLW and COFW. + +
+
+ comment: WACV 2025 Project Link: https://ben0919.github.io/ORFormer/ +
+
+
+
+
+ + ♻ ☆ Diversified Augmentation with Domain Adaptation for Debiased Video + Temporal Grounding + + +
+ Temporal sentence grounding in videos (TSGV) faces challenges due to public +TSGV datasets containing significant temporal biases, which are attributed to +the uneven temporal distributions of target moments. Existing methods generate +augmented videos, where target moments are forced to have varying temporal +locations. However, since the video lengths of the given datasets have small +variations, only changing the temporal locations results in poor generalization +ability in videos with varying lengths. In this paper, we propose a novel +training framework complemented by diversified data augmentation and a domain +discriminator. The data augmentation generates videos with various lengths and +target moment locations to diversify temporal distributions. However, augmented +videos inevitably exhibit distinct feature distributions which may introduce +noise. To address this, we design a domain adaptation auxiliary task to +diminish feature discrepancies between original and augmented videos. We also +encourage the model to produce distinct predictions for videos with the same +text queries but different moment locations to promote debiased training. +Experiments on Charades-CD and ActivityNet-CD datasets demonstrate the +effectiveness and generalization abilities of our method in multiple grounding +structures, achieving state-of-the-art results. + +
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ MSCViT: A Small-size ViT architecture with Multi-Scale Self-Attention + Mechanism for Tiny Datasets + + +
+ Vision Transformer (ViT) has demonstrated significant potential in various vision tasks due to its strong ability to model long-range dependencies. However, such success is largely fueled by training on massive samples. In real applications, large-scale datasets are not always available, and ViT performs worse than Convolutional Neural Networks (CNNs) if it is only trained on a small-scale dataset (called a tiny dataset), since it requires a large amount of training data to ensure its representational capacity. In this paper, a small-size ViT architecture with a multi-scale self-attention mechanism and convolution blocks is presented (dubbed MSCViT) to model different scales of attention at each layer. Firstly, we introduce wavelet convolution, which selectively combines the high-frequency components obtained by frequency division with our convolution channel to extract local features. Then, a lightweight multi-head attention module is developed to reduce the number of tokens and computational costs. Finally, the positional encoding (PE) in the backbone is replaced by a local feature extraction module. Compared with the original ViT, our model is parameter-efficient and particularly suitable for tiny datasets. Extensive experiments have been conducted on tiny datasets, in which our model achieves an accuracy of 84.68% on CIFAR-100 with 14.0M parameters and 2.5 GFLOPs, without pre-training on large datasets. +
+
+
+
+
+ + ♻ ☆ WINE: Wavelet-Guided GAN Inversion and Editing for High-Fidelity + Refinement + + +
+ Recent advanced GAN inversion models aim to convey high-fidelity information from original images to generators through methods using generator tuning or high-dimensional feature learning. Despite these efforts, accurately reconstructing image-specific details remains a challenge due to inherent limitations in both training and structural aspects, leading to a bias towards low-frequency information. In this paper, we look into the widely used pixel loss in GAN inversion, revealing its predominant focus on the reconstruction of low-frequency features. We then propose WINE, a Wavelet-guided GAN Inversion aNd Editing model, which transfers high-frequency information through wavelet coefficients via a newly proposed wavelet loss and wavelet fusion scheme. Notably, WINE is the first attempt to interpret GAN inversion in the frequency domain. Our experimental results showcase the precision of WINE in preserving high-frequency details and enhancing image quality. Even in editing scenarios, WINE outperforms existing state-of-the-art GAN inversion models with a fine balance between editability and reconstruction quality. +
+
+
+
+
+ + ♻ ☆ Generalized and Efficient 2D Gaussian Splatting for Arbitrary-scale + Super-Resolution + + +
+ Equipped with the continuous representation capability of Multi-Layer Perceptrons (MLPs), Implicit Neural Representation (INR) has been successfully employed for Arbitrary-scale Super-Resolution (ASR). However, the limited receptive field of the linear layers in an MLP restricts the representation capability of INR, and it is computationally expensive to query the MLP numerous times to render each pixel. Recently, Gaussian Splatting (GS) has shown its advantages over INR in both visual quality and rendering speed in 3D tasks, which motivates us to explore whether GS can be employed for the ASR task. However, directly applying GS to ASR is exceptionally challenging because the original GS is an optimization-based method that overfits each single scene, whereas in ASR we aim to learn a single model that can generalize to different images and scaling factors. We overcome these challenges by developing two novel techniques. Firstly, to generalize GS for ASR, we elaborately design an architecture that predicts the image-conditioned Gaussians of the input low-resolution image in a feed-forward manner. Secondly, we implement an efficient differentiable 2D GPU/CUDA-based scale-aware rasterization to render super-resolved images by sampling discrete RGB values from the predicted continuous Gaussians. Via end-to-end training, our optimized network, namely GSASR, can perform ASR for any image and unseen scaling factors. Extensive experiments validate the effectiveness of our proposed method. The project page can be found at \url{https://mt-cly.github.io/GSASR.github.io/}. +
+
+
+
+
+ + ♻ ☆ Dynamic Sub-graph Distillation for Robust Semi-supervised Continual + Learning + + +
+ Continual learning (CL) has shown promising results and comparable +performance to learning at once in a fully supervised manner. However, CL +strategies typically require a large number of labeled samples, making their +real-life deployment challenging. In this work, we focus on semi-supervised +continual learning (SSCL), where the model progressively learns from partially +labeled data with unknown categories. We provide a comprehensive analysis of +SSCL and demonstrate that unreliable distributions of unlabeled data lead to +unstable training and refinement of the progressing stages. This problem +severely impacts the performance of SSCL. To address the limitations, we +propose a novel approach called Dynamic Sub-Graph Distillation (DSGD) for +semi-supervised continual learning, which leverages both semantic and +structural information to achieve more stable knowledge distillation on +unlabeled data and exhibit robustness against distribution bias. Firstly, we +formalize a general model of structural distillation and design a dynamic graph +construction for the continual learning progress. Next, we define a structure +distillation vector and design a dynamic sub-graph distillation algorithm, +which enables end-to-end training and adaptability to scale up tasks. The +entire proposed method is adaptable to various CL methods and supervision +settings. Finally, experiments conducted on three datasets CIFAR10, CIFAR100, +and ImageNet-100, with varying supervision ratios, demonstrate the +effectiveness of our proposed approach in mitigating the catastrophic +forgetting problem in semi-supervised continual learning scenarios. + +
+
+
+
+
+ + ♻ ☆ Less is More: The Influence of Pruning on the Explainability of CNNs + + +
+ Over the last decade, deep learning models have become the state of the art for solving complex computer vision problems. These modern computer vision models have millions of parameters, which presents two major challenges: (1) the increased computational requirements hamper deployment in resource-constrained environments, such as mobile or IoT devices, and (2) explaining the complex decisions of such networks to humans is challenging. Network pruning is a technical approach to reduce the complexity of models, where less important parameters are removed. The work presented in this paper investigates whether this reduction in technical complexity also helps with perceived explainability. To do so, we conducted a pre-study and two human-grounded experiments, assessing the effects of different pruning ratios on explainability. Overall, we evaluate four different compression rates (i.e., 2, 4, 8, and 32) with 37,500 tasks on Mechanical Turk. Results indicate that lower compression rates have a positive influence on explainability, while higher compression rates show negative effects. Furthermore, we were able to identify sweet spots that increase both the perceived explainability and the model's performance. +
+
+
+
+
+ + ♻ ☆ Spurious Feature Eraser: Stabilizing Test-Time Adaptation for + Vision-Language Foundation Model + + +
+ Vision-language foundation models have exhibited remarkable success across a multitude of downstream tasks due to their scalability on extensive image-text paired data. However, these models also display significant limitations when applied to downstream tasks, such as fine-grained image classification, as a result of ``decision shortcuts'' that hinder their generalization capabilities. In this work, we find that the CLIP model possesses a rich set of features, encompassing both \textit{desired invariant causal features} and \textit{undesired decision shortcuts}. Moreover, the underperformance of CLIP on downstream tasks originates from its inability to effectively utilize pre-trained features in accordance with specific task requirements. To address this challenge, we propose a simple yet effective method, Spurious Feature Eraser (SEraser), to alleviate the decision shortcuts by erasing the spurious features. Specifically, we introduce a test-time prompt tuning paradigm that optimizes a learnable prompt, thereby compelling the model to exploit invariant features while disregarding decision shortcuts during the inference phase. The proposed method effectively alleviates excessive dependence on potentially misleading spurious information. We conduct a comparative analysis of the proposed method against various approaches, which validates its significant superiority. +
+
+
+
+
+ + ♻ ☆ ImagiNet: A Multi-Content Benchmark for Synthetic Image Detection + + +
+ Recent generative models produce images with a level of authenticity that makes them nearly indistinguishable from real photos and artwork. The potential harmful use cases of these models necessitate the creation of robust synthetic image detectors. However, current datasets in the field contain generated images of questionable quality or draw examples from one predominant content type, which leads to poor generalizability of the underlying detectors. We find that curating a balanced amount of high-resolution generated images across various content types is crucial for the generalizability of detectors, and introduce ImagiNet, a dataset of 200K examples spanning four categories: photos, paintings, faces, and miscellaneous. Synthetic images in ImagiNet are produced with both open-source and proprietary generators, whereas real counterparts for each content type are collected from public datasets. The structure of ImagiNet allows for a two-track evaluation system: i) classification as real or synthetic and ii) identification of the generative model. To establish a strong baseline, we train a ResNet-50 model using a self-supervised contrastive objective (SelfCon) for each track, which achieves an evaluation AUC of up to 0.99 and balanced accuracy ranging from 86% to 95%, even under conditions that involve compression and resizing. The provided model is generalizable enough to achieve zero-shot state-of-the-art performance on previous synthetic detection benchmarks. We provide ablations to demonstrate the importance of content types and publish code and data. +
+
+ comment: Workshop on Datasets and Evaluators of AI Safety, AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Digi2Real: Bridging the Realism Gap in Synthetic Data Face Recognition + via Foundation Models + + +
+ The accuracy of face recognition systems has improved significantly in the +past few years, thanks to the large amount of data collected and advancements +in neural network architectures. However, these large-scale datasets are often +collected without explicit consent, raising ethical and privacy concerns. To +address this, there have been proposals to use synthetic datasets for training +face recognition models. Yet, such models still rely on real data to train the +generative models and generally exhibit inferior performance compared to those +trained on real datasets. One of these datasets, DigiFace, uses a graphics +pipeline to generate different identities and intra-class variations without +using real data in model training. However, the performance of this approach is +poor on face recognition benchmarks, possibly due to the lack of realism in the +images generated by the graphics pipeline. In this work, we introduce a novel +framework for realism transfer aimed at enhancing the realism of synthetically +generated face images. Our method leverages the large-scale face foundation +model, and we adapt the pipeline for realism enhancement. By integrating the +controllable aspects of the graphics pipeline with our realism enhancement +technique, we generate a large amount of realistic variations, combining the +advantages of both approaches. Our empirical evaluations demonstrate that +models trained using our enhanced dataset significantly improve the performance +of face recognition systems over the baseline. The source code and dataset will +be publicly accessible at the following link: +https://www.idiap.ch/paper/digi2real + +
+
+ comment: The dataset would be available here: + https://www.idiap.ch/paper/digi2real Accepted for Publication in WACV 2025 +
+
+
+
+
+ + ♻ ☆ MambaTalk: Efficient Holistic Gesture Synthesis with Selective State + Space Models + + +
+ Gesture synthesis is a vital realm of human-computer interaction, with +wide-ranging applications across various fields like film, robotics, and +virtual reality. Recent advancements have utilized the diffusion model and +attention mechanisms to improve gesture synthesis. However, due to the high +computational complexity of these techniques, generating long and diverse +sequences with low latency remains a challenge. We explore the potential of +state space models (SSMs) to address the challenge, implementing a two-stage +modeling strategy with discrete motion priors to enhance the quality of +gestures. Leveraging the foundational Mamba block, we introduce MambaTalk, +enhancing gesture diversity and rhythm through multimodal integration. +Extensive experiments demonstrate that our method matches or exceeds the +performance of state-of-the-art models. + +
+
+ comment: NeurIPS 2024, Camera Ready +
+
+
+
+
+ + ♻ ☆ Audio-Agent: Leveraging LLMs For Audio Generation, Editing and + Composition + + +
+ We introduce Audio-Agent, a multimodal framework for audio generation, +editing and composition based on text or video inputs. Conventional approaches +for text-to-audio (TTA) tasks often make single-pass inferences from text +descriptions. While straightforward, this design struggles to produce +high-quality audio when given complex text conditions. In our method, we +utilize a pre-trained TTA diffusion network as the audio generation agent to +work in tandem with GPT-4, which decomposes the text condition into atomic, +specific instructions and calls the agent for audio generation. In doing so, +Audio-Agent can generate high-quality audio that is closely aligned with the +provided text or video exhibiting complex and multiple events, while supporting +variable-length and variable-volume generation. For video-to-audio (VTA) tasks, +most existing methods require training a timestamp detector to synchronize +video events with the generated audio, a process that can be tedious and +time-consuming. Instead, we propose a simpler approach by fine-tuning a +pre-trained Large Language Model (LLM), e.g., Gemma2-2B-it, to obtain both +semantic and temporal conditions that bridge the video and audio modality. +Consequently, our framework contributes a comprehensive solution for both TTA +and VTA tasks without substantial computational overhead in training. + +
+
+
+
+
+ + ♻ ☆ EventHallusion: Diagnosing Event Hallucinations in Video LLMs + + +
+ Recently, Multimodal Large Language Models (MLLMs) have made significant
+progress in the video comprehension field. Despite the remarkable content reasoning
+and instruction-following capabilities they have demonstrated, the hallucination
+problem of these VideoLLMs is less explored than its counterpart in
+the image domain. To mitigate this gap, we propose EventHallusion, a novel
+benchmark that focuses on assessing the VideoLLMs' hallucination toward events,
+the crux of video analysis. From a hallucination attribution perspective, our
+EventHallusion benchmark is curated to assess a VideoLLM's susceptibility
+toward language priors and vision-language biases. On the other hand, we also
+propose a simple yet effective method, called Temporal Contrastive Decoding
+(TCD), to tackle the hallucination problems of VideoLLMs. The proposed TCD
+method rectifies the model's bias toward its priors during the decoding stage
+by comparing the original video with a modified version, in which temporal cues
+are disrupted. Through comprehensive evaluation of eight open-source and two
+closed-source VideoLLMs on the proposed EventHallusion benchmark, we observe
+that the open-source models suffer significantly from hallucination problems,
+whereas the closed-source ones perform markedly better. By further equipping
+open-source VideoLLMs with the proposed TCD approach, clear performance
+improvements are achieved across most metrics in the EventHallusion benchmark.
+Our codes and benchmark data are available at
+https://github.com/Stevetich/EventHallusion.
+
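+ A minimal sketch of the contrastive-decoding idea summarized above, assuming we already have next-token logits from the same VideoLLM run on the original and on a temporally disrupted copy of the video; this is an illustration under those assumptions, not the authors' released TCD implementation.
+import torch
+
+def temporal_contrastive_logits(logits_orig: torch.Tensor,
+                                logits_disrupted: torch.Tensor,
+                                alpha: float = 1.0) -> torch.Tensor:
+    # Amplify evidence that is only present when temporal order is intact:
+    # (1 + alpha) * logits(original) - alpha * logits(disrupted).
+    return (1.0 + alpha) * logits_orig - alpha * logits_disrupted
+
+# next_token = torch.argmax(temporal_contrastive_logits(lo, ld), dim=-1)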
+
+
+
+
+ + ♻ ☆ Fast, Scale-Adaptive, and Uncertainty-Aware Downscaling of Earth System + Model Fields with Generative Machine Learning + + +
+ Accurate and high-resolution Earth system model (ESM) simulations are +essential to assess the ecological and socio-economic impacts of anthropogenic +climate change, but are computationally too expensive to be run at sufficiently +high spatial resolution. Recent machine learning approaches have shown +promising results in downscaling ESM simulations, outperforming +state-of-the-art statistical approaches. However, existing methods require +computationally costly retraining for each ESM and extrapolate poorly to +climates unseen during training. We address these shortcomings by learning a +consistency model (CM) that efficiently and accurately downscales arbitrary ESM +simulations without retraining in a zero-shot manner. Our approach yields +probabilistic downscaled fields at a resolution only limited by the +observational reference data. We show that the CM outperforms state-of-the-art +diffusion models at a fraction of computational cost while maintaining high +controllability on the downscaling task. Further, our method generalizes to +climate states unseen during training without explicitly formulated physical +constraints. + +
+
+
+
+
+ + ♻ ☆ Learning Symmetries via Weight-Sharing with Doubly Stochastic Tensors + + +
+ Group equivariance has emerged as a valuable inductive bias in deep learning, +enhancing generalization, data efficiency, and robustness. Classically, group +equivariant methods require the groups of interest to be known beforehand, +which may not be realistic for real-world data. Additionally, baking in fixed +group equivariance may impose overly restrictive constraints on model +architecture. This highlights the need for methods that can dynamically +discover and apply symmetries as soft constraints. For neural network +architectures, equivariance is commonly achieved through group transformations +of a canonical weight tensor, resulting in weight sharing over a given group +$G$. In this work, we propose to learn such a weight-sharing scheme by defining +a collection of learnable doubly stochastic matrices that act as soft +permutation matrices on canonical weight tensors, which can take regular group +representations as a special case. This yields learnable kernel transformations +that are jointly optimized with downstream tasks. We show that when the dataset +exhibits strong symmetries, the permutation matrices will converge to regular +group representations and our weight-sharing networks effectively become +regular group convolutions. Additionally, the flexibility of the method enables +it to effectively pick up on partial symmetries. + +
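+ As a rough illustration of the soft-permutation idea described above (a sketch under our own assumptions, not the paper's code): an unconstrained learnable square matrix can be pushed toward doubly stochastic form with Sinkhorn normalization and then applied to a canonical weight tensor.
+import torch
+
+def sinkhorn(logits: torch.Tensor, n_iters: int = 20) -> torch.Tensor:
+    # Alternate row/column normalization in log space so the result is
+    # (approximately) doubly stochastic.
+    log_p = logits
+    for _ in range(n_iters):
+        log_p = log_p - torch.logsumexp(log_p, dim=-1, keepdim=True)  # rows
+        log_p = log_p - torch.logsumexp(log_p, dim=-2, keepdim=True)  # columns
+    return log_p.exp()
+
+# w_g = sinkhorn(theta_g) @ w_canonical  # one learnable theta_g per "group element" slot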
+
+ comment: 19 pages, 14 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ TextureCrop: Enhancing Synthetic Image Detection through Texture-based + Cropping + + +
+ Generative AI technologies produce increasingly realistic imagery, which,
+despite its potential for creative applications, can also be misused to produce
+misleading and harmful content. This renders Synthetic Image Detection (SID)
+methods essential for identifying AI-generated content online. State-of-the-art
+SID methods typically resize or center-crop input images due to architectural
+or computational constraints, which hampers the detection of artifacts that
+appear in high-resolution images. To address this limitation, we propose
+TextureCrop, an image pre-processing component that can be plugged into any
+pre-trained SID model to improve its performance. By focusing on high-frequency
+image parts where generative artifacts are prevalent, TextureCrop enhances SID
+performance with manageable memory requirements. Experimental results
+demonstrate a consistent improvement in AUC across various detectors by 6.1%
+compared to center cropping and by 15% compared to resizing, across
+high-resolution images from the Forensynths, Synthbuster and TWIGMA datasets.
+Code available at https://github.com/mever-team/texture-crop.
+
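+ A toy sketch of texture-based cropping in the spirit described above, using local variance as a stand-in high-frequency score; the crop size, scoring criterion, and aggregation used by the actual TextureCrop component may differ.
+import numpy as np
+
+def texture_crops(img: np.ndarray, crop: int = 224, top_k: int = 8):
+    # Rank non-overlapping crops of a high-resolution image by a simple
+    # texture proxy (local variance) and keep the most textured ones.
+    h, w = img.shape[:2]
+    scored = []
+    for y in range(0, h - crop + 1, crop):
+        for x in range(0, w - crop + 1, crop):
+            patch = img[y:y + crop, x:x + crop]
+            scored.append((float(patch.var()), patch))
+    scored.sort(key=lambda t: t[0], reverse=True)
+    return [p for _, p in scored[:top_k]]
+
+# prediction = np.mean([sid_model(p) for p in texture_crops(image)])  # sid_model is hypothetical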
+
+ comment: 10 pages, 7 images +
+
+
+
+
+ + ♻ ☆ Transformers and Large Language Models for Efficient Intrusion Detection + Systems: A Comprehensive Survey + + +
+ With significant advancements in Transformers and LLMs, NLP has extended its
+reach into many research fields due to its enhanced capabilities in text
+generation and user interaction. One field benefiting greatly from these
+advancements is cybersecurity. In cybersecurity, many parameters that need to
+be protected and exchanged between senders and receivers are in the form of
+text and tabular data, making NLP a valuable tool in enhancing the security
+measures of communication protocols. This survey paper provides a comprehensive
+analysis of the utilization of Transformers and LLMs in cyber-threat detection
+systems. The methodology of paper selection and bibliometric analysis is
+outlined to establish a rigorous framework for evaluating existing research.
+The fundamentals of Transformers are discussed, including background
+information on various cyber-attacks and datasets commonly used in this field.
+The survey explores the application of Transformers in IDSs, focusing on
+different architectures such as Attention-based models, LLMs like BERT and GPT,
+CNN/LSTM-Transformer hybrids, and emerging approaches like ViTs, among others.
+Furthermore, it explores the diverse environments and applications where
+Transformer- and LLM-based IDSs have been implemented, including computer
+networks, IoT devices, critical infrastructure protection, cloud computing,
+SDN, and autonomous vehicles. The paper also addresses research
+challenges and future directions in this area, identifying key issues such as
+interpretability, scalability, and adaptability to evolving threats.
+Finally, the conclusion summarizes the findings and highlights the significance
+of Transformers and LLMs in enhancing cyber-threat detection capabilities,
+while also outlining potential avenues for further research and development.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2405.04760 by other authors +
+
+
+
+
+ + ♻ ☆ Rethinking Decoders for Transformer-based Semantic Segmentation: A + Compression Perspective NeurIPS2024 + + +
+ State-of-the-art methods for Transformer-based semantic segmentation
+typically adopt Transformer decoders that are used to extract additional
+embeddings from image embeddings via cross-attention, refine either or both
+types of embeddings via self-attention, and project image embeddings onto the
+additional embeddings via dot-product. Despite their remarkable success, these
+empirical designs still lack theoretical justifications or interpretations,
+thus hindering potentially principled improvements. In this paper, we argue
+that there are fundamental connections between semantic segmentation and
+compression, especially between the Transformer decoders and Principal
+Component Analysis (PCA). From such a perspective, we derive a white-box, fully
+attentional DEcoder for PrIncipled semantiC segmenTation (DEPICT), with the
+interpretations as follows: 1) the self-attention operator refines image
+embeddings to construct an ideal principal subspace that aligns with the
+supervision and retains most information; 2) the cross-attention operator seeks
+to find a low-rank approximation of the refined image embeddings, which is
+expected to be a set of orthonormal bases of the principal subspace and
+corresponds to the predefined classes; 3) the dot-product operation yields
+compact representations of image embeddings as segmentation masks. Experiments
+conducted on the ADE20K dataset show that DEPICT consistently outperforms its
+black-box counterpart, Segmenter, while being lightweight and more robust.
+
+
+ comment: NeurIPS2024. Code:https://github.com/QishuaiWen/DEPICT/ +
+
+
+
+
+ + ♻ ☆ Enhanced Masked Image Modeling to Avoid Model Collapse on Multi-modal + MRI Datasets + + +
+ Multi-modal magnetic resonance imaging (MRI) provides information about lesions
+for computer-aided diagnosis from different views. Deep learning algorithms are
+suitable for identifying specific anatomical structures, segmenting lesions,
+and classifying diseases. Manual labels are limited due to the high expense,
+which hinders further improvement of accuracy. Self-supervised learning,
+particularly masked image modeling (MIM), has shown promise in utilizing
+unlabeled data. However, we observe model collapse when applying MIM to
+multi-modal MRI datasets, and downstream task performance shows no
+improvement when the collapsed model is used. To solve model collapse, we analyze
+and address it in two types: complete collapse and dimensional collapse. We
+find complete collapse occurs because the collapsed loss value in multi-modal
+MRI datasets falls below the normally converged loss value. Based on this, the
+hybrid mask pattern (HMP) masking strategy is introduced to elevate the
+collapsed loss above the normally converged loss value and avoid complete
+collapse. Additionally, we reveal that dimensional collapse stems from
+insufficient feature uniformity in MIM. We mitigate dimensional collapse by
+introducing the pyramid Barlow twins (PBT) module as an explicit regularization
+method. Overall, we construct the enhanced MIM (E-MIM) with the HMP and PBT modules
+to avoid model collapse on multi-modal MRI. Experiments are conducted on three
+multi-modal MRI datasets to validate the effectiveness of our approach in
+preventing both types of model collapse. By preventing model collapse, the
+training of the model becomes more stable, resulting in a decent improvement in
+performance for segmentation and classification tasks. The code is available at
+https://github.com/LinxuanHan/E-MIM.
+
+
+
+
+
+ + ♻ ☆ Perception Matters: Enhancing Embodied AI with Uncertainty-Aware + Semantic Segmentation + + +
+ Embodied AI has made significant progress acting in unexplored environments. +However, tasks such as object search have largely focused on efficient policy +learning. In this work, we identify several gaps in current search methods: +They largely focus on dated perception models, neglect temporal aggregation, +and transfer from ground truth directly to noisy perception at test time, +without accounting for the resulting overconfidence in the perceived state. We +address the identified problems through calibrated perception probabilities and +uncertainty across aggregation and found decisions, thereby adapting the models +for sequential tasks. The resulting methods can be directly integrated with +pretrained models across a wide family of existing search approaches at no +additional training cost. We perform extensive evaluations of aggregation +methods across both different semantic perception models and policies, +confirming the importance of calibrated uncertainties in both the aggregation +and found decisions. We make the code and trained models available at +https://semantic-search.cs.uni-freiburg.de. + +
+
+
+
+
+ + ♻ ☆ TextureDiffusion: Target Prompt Disentangled Editing for Various Texture + Transfer + + +
+ Recently, text-guided image editing has achieved significant success.
+However, existing methods can only apply simple textures like wood or gold when
+changing the texture of an object. Complex textures such as cloud or fire pose
+a challenge. This limitation stems from the fact that the target prompt needs to contain
+both the input image content and <texture>, restricting the texture
+representation. In this paper, we propose TextureDiffusion, a tuning-free image
+editing method applied to various texture transfer tasks. Initially, the target
+prompt is directly set to "<texture>", making the texture disentangled from the
+input image content to enhance texture representation. Subsequently, query
+features in self-attention and features in residual blocks are utilized to
+preserve the structure of the input image. Finally, to maintain the background,
+we introduce an edit localization technique which blends the self-attention
+results and the intermediate latents. Comprehensive experiments demonstrate
+that TextureDiffusion can harmoniously transfer various textures with excellent
+structure and background preservation. Code is publicly available at
+https://github.com/THU-CVML/TextureDiffusion
+
+
+
+
+
+ + ♻ ☆ ONER: Online Experience Replay for Incremental Anomaly Detection + + +
+ Incremental anomaly detection sequentially recognizes abnormal regions in
+novel categories for dynamic industrial scenarios. This remains highly
+challenging due to knowledge overwriting and feature conflicts, leading to
+catastrophic forgetting. In this work, we propose ONER, an end-to-end ONline
+Experience Replay method, which efficiently mitigates catastrophic forgetting
+while adapting to new tasks with minimal cost. Specifically, our framework
+utilizes two types of experiences from past tasks: decomposed prompts and
+semantic prototypes, addressing both model parameter updates and feature
+optimization. The decomposed prompts consist of learnable components that
+assemble to produce attention-conditioned prompts. These prompts reuse
+previously learned knowledge, enabling the model to learn novel tasks effectively.
+The semantic prototypes operate at both pixel and image levels, performing
+regularization in the latent feature space to prevent forgetting across various
+tasks. Extensive experiments demonstrate that our method achieves
+state-of-the-art performance in incremental anomaly detection with
+significantly reduced forgetting, while efficiently adapting to new
+categories at minimal cost. These results confirm the efficiency and
+stability of ONER, making it a powerful solution for real-world applications.
+
+
+
+
+
+ + ♻ ☆ HyFusion: Enhanced Reception Field Transformer for Hyperspectral Image + Fusion RSS 2025 + + +
+ Hyperspectral image (HSI) fusion addresses the challenge of reconstructing +High-Resolution HSIs (HR-HSIs) from High-Resolution Multispectral images +(HR-MSIs) and Low-Resolution HSIs (LR-HSIs), a critical task given the high +costs and hardware limitations associated with acquiring high-quality HSIs. +While existing methods leverage spatial and spectral relationships, they often +suffer from limited receptive fields and insufficient feature utilization, +leading to suboptimal performance. Furthermore, the scarcity of high-quality +HSI data highlights the importance of efficient data utilization to maximize +reconstruction quality. To address these issues, we propose HyFusion, a novel +Dual-Coupled Network (DCN) framework designed to enhance cross-domain feature +extraction and enable effective feature map reusing. The framework first +processes HR-MSI and LR-HSI inputs through specialized subnetworks that +mutually enhance each other during feature extraction, preserving complementary +spatial and spectral details. At its core, HyFusion utilizes an Enhanced +Reception Field Block (ERFB), which combines shifting-window attention and +dense connections to expand the receptive field, effectively capturing +long-range dependencies while minimizing information loss. Extensive +experiments demonstrate that HyFusion achieves state-of-the-art performance in +HR-MSI/LR-HSI fusion, significantly improving reconstruction quality while +maintaining a compact model size and computational efficiency. By integrating +enhanced receptive fields and feature map reusing into a coupled network +architecture, HyFusion provides a practical and effective solution for HSI +fusion in resource-constrained scenarios, setting a new benchmark in +hyperspectral imaging. Our code will be publicly available. + +
+
+ comment: Submitted to IGARSS 2025 +
+
+
+
+
+ + ♻ ☆ Knowledge-Guided Prompt Learning for Deepfake Facial Image Detection + + +
+ Recent generative models demonstrate impressive performance on synthesizing
+photographic images, making them hard for humans to distinguish from
+pristine ones, especially for realistic-looking synthetic facial images.
+Previous works mostly focus on mining discriminative artifacts from vast amounts
+of visual data. However, they usually lack the exploration of prior knowledge
+and rarely pay attention to the domain shift between training categories (e.g.,
+natural and indoor objects) and testing ones (e.g., fine-grained human facial
+images), resulting in unsatisfactory detection performance. To address these
+issues, we propose a novel knowledge-guided prompt learning method for deepfake
+facial image detection. Specifically, we retrieve forgery-related prompts from
+large language models as expert knowledge to guide the optimization of
+learnable prompts. In addition, we develop test-time prompt tuning to alleviate
+the domain shift, achieving a significant performance improvement and facilitating
+application in real-world scenarios. Extensive experiments on
+the DeepFakeFaceForensics dataset show that our proposed approach notably
+outperforms state-of-the-art methods.
+
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ PastNet: Introducing Physical Inductive Biases for Spatio-temporal Video + Prediction + + +
+ In this paper, we investigate the challenge of the spatio-temporal video
+prediction task, which involves generating future video frames based on
+historical spatio-temporal observation streams. Existing approaches typically
+utilize external information such as semantic maps to improve video prediction
+accuracy, which often neglect the inherent physical knowledge embedded within
+videos. Worse still, their high computational costs could impede their
+applications for high-resolution videos. To address these constraints, we
+introduce a novel framework called Physics-assisted Spatio-temporal Network
+(PastNet) for high-quality video prediction. The core of PastNet lies in incorporating a
+spectral convolution operator in the Fourier domain, which efficiently
+introduces inductive biases from the underlying physical laws. Additionally, we
+employ a memory bank with the estimated intrinsic dimensionality to discretize
+local features during the processing of complex spatio-temporal signals,
+thereby reducing computational costs and facilitating efficient high-resolution
+video prediction. Extensive experiments on various widely-used spatio-temporal
+video benchmarks demonstrate the effectiveness and efficiency of the proposed
+PastNet compared with a range of state-of-the-art methods, particularly in
+high-resolution scenarios.
+
+
+ comment: 11 +
+
+
+
+
+ + ♻ ☆ DehazeGS: Seeing Through Fog with 3D Gaussian Splatting + + +
+ Current novel view synthesis tasks primarily rely on high-quality and clear
+images. However, in foggy scenes, scattering and attenuation can significantly
+degrade the reconstruction and rendering quality. Although NeRF-based dehazing
+reconstruction algorithms have been developed, their use of deep fully
+connected neural networks and per-ray sampling strategies leads to high
+computational costs. Moreover, NeRF's implicit representation struggles to
+recover fine details from hazy scenes. In contrast, recent advancements in 3D
+Gaussian Splatting achieve high-quality 3D scene reconstruction by explicitly
+modeling point clouds into 3D Gaussians. In this paper, we propose leveraging
+the explicit Gaussian representation to explain the foggy image formation
+process through a physically accurate forward rendering process. We introduce
+DehazeGS, a method capable of decomposing and rendering a fog-free background
+from participating media using only multi-view foggy images as input. We model
+the transmission within each Gaussian distribution to simulate the formation of
+fog. During this process, we jointly learn the atmospheric light and scattering
+coefficient while optimizing the Gaussian representation of the hazy scene. In
+the inference stage, we eliminate the effects of scattering and attenuation on
+the Gaussians and directly project them onto a 2D plane to obtain a clear view.
+Experiments on both synthetic and real-world foggy datasets demonstrate that
+DehazeGS achieves state-of-the-art performance in terms of both rendering
+quality and computational efficiency.
+
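+ For reference, the standard atmospheric scattering model that this kind of fog-aware forward rendering builds on (a generic sketch, not the per-Gaussian formulation used in DehazeGS):
+import torch
+
+def foggy_render(clear_rgb: torch.Tensor, depth: torch.Tensor,
+                 atmosphere: torch.Tensor, beta: float) -> torch.Tensor:
+    # I = J * t + A * (1 - t), with transmission t = exp(-beta * depth).
+    t = torch.exp(-beta * depth).unsqueeze(-1)  # (..., 1)
+    return clear_rgb * t + atmosphere * (1.0 - t)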
+
+ comment: 9 pages,4 figures +
+
+
+
+
+ + ♻ ☆ Spacewalker: Traversing Representation Spaces for Fast Interactive + Exploration and Annotation of Unstructured Data + + +
+ In industries such as healthcare, finance, and manufacturing, unstructured
+textual data presents significant challenges for analysis and
+decision making. Uncovering patterns within large-scale corpora and
+understanding their semantic impact is critical, but depends on domain experts
+or resource-intensive manual reviews. In response, this system demonstration paper
+introduces Spacewalker, an interactive tool designed to analyze,
+explore, and annotate data across multiple modalities. It allows users to
+extract data representations, visualize them in low-dimensional spaces and
+traverse large datasets either in an exploratory manner or by querying regions of interest.
+We evaluated Spacewalker through extensive experiments and annotation studies,
+assessing its efficacy in improving data integrity verification and annotation.
+We show that Spacewalker reduces time and effort compared to traditional
+methods. The code of this work is open-source and can be found at:
+https://github.com/code-lukas/Spacewalker
+
+
+
+
+
+ + ♻ ☆ Knowledge Transfer and Domain Adaptation for Fine-Grained Remote Sensing + Image Segmentation + + +
+ Fine-grained remote sensing image segmentation is essential for accurately +identifying detailed objects in remote sensing images. Recently, vision +transformer models (VTMs) pre-trained on large-scale datasets have demonstrated +strong zero-shot generalization. However, directly applying them to specific +tasks may lead to domain shift. We introduce a novel end-to-end learning +paradigm combining knowledge guidance with domain refinement to enhance +performance. We present two key components: the Feature Alignment Module (FAM) +and the Feature Modulation Module (FMM). FAM aligns features from a CNN-based +backbone with those from the pretrained VTM's encoder using channel +transformation and spatial interpolation, and transfers knowledge via KL +divergence and L2 normalization constraint. FMM further adapts the knowledge to +the specific domain to address domain shift. We also introduce a fine-grained +grass segmentation dataset and demonstrate, through experiments on two +datasets, that our method achieves a significant improvement of 2.57 mIoU on +the grass dataset and 3.73 mIoU on the cloud dataset. The results highlight the +potential of combining knowledge transfer and domain adaptation to overcome +domain-related challenges and data limitations. The project page is available +at https://xavierjiezou.github.io/KTDA/. + +
+
+ comment: 6 pages, 3 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Edicho: Consistent Image Editing in the Wild + + +
+ Despite being a well-recognized need, consistent editing across in-the-wild images remains a
+technical challenge arising from various unmanageable factors, like object
+poses, lighting conditions, and photography environments. Edicho steps in with
+a training-free solution based on diffusion models, featuring a fundamental
+design principle of using explicit image correspondence to direct editing.
+Specifically, the key components include an attention manipulation module and a
+carefully refined classifier-free guidance (CFG) denoising strategy, both of
+which take into account the pre-estimated correspondence. Such an
+inference-time algorithm enjoys a plug-and-play nature and is compatible with
+most diffusion-based editing methods, such as ControlNet and BrushNet.
+Extensive results demonstrate the efficacy of Edicho in consistent cross-image
+editing under diverse settings. We will release the code to facilitate future
+studies.
+
+
+ comment: Project page: https://ant-research.github.io/edicho/ +
+
+
+
+
+ + ♻ ☆ MoPE: Mixture of Prompt Experts for Parameter-Efficient and Scalable + Multimodal Fusion + + +
+ Despite the demonstrated parameter efficiency of prompt-based multimodal
+fusion methods, their limited adaptivity and expressiveness often result in
+suboptimal performance compared to other tuning approaches. In this paper, we
+introduce the Mixture of Prompt Experts (MoPE), the first technique designed to
+overcome these limitations by decomposing standard prompts to capture
+instance-level features adaptively. Building on this decomposition, MoPE
+enhances prompt fusion's expressiveness by leveraging multimodal pairing priors
+to route the most effective prompt for each instance dynamically. Compared to
+vanilla prompting, our MoPE-based fusion method exhibits greater
+expressiveness, scaling more effectively with the training data and the overall
+number of trainable parameters. We also investigate regularization terms for
+expert routing, which lead to emergent expert specialization with enhanced
+adaptiveness and interpretability. Extensive experiments across six multimodal
+datasets spanning four modalities demonstrate state-of-the-art performance for
+prompt fusion, matching or even surpassing the performance of fine-tuning while
+requiring only 0.8% of the trainable parameters. Project homepage:
+https://github.com/songrise/MoPE
+
+
+ comment: Under Review, Extended version of arxiv:2312.03734 +
+
+
+
+
+ + ♻ ☆ BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and + Vision-Language Models Derived from Scientific Literature + + +
+ The development of vision-language models (VLMs) is driven by large-scale and +diverse multimodal datasets. However, progress toward generalist biomedical +VLMs is limited by the lack of annotated, publicly accessible datasets across +biology and medicine. Existing efforts are restricted to narrow domains, +missing the full diversity of biomedical knowledge encoded in scientific +literature. To address this gap, we introduce BIOMEDICA, a scalable, +open-source framework to extract, annotate, and serialize the entirety of the +PubMed Central Open Access subset into an easy-to-use, publicly accessible +dataset. Our framework produces a comprehensive archive with over 24 million +unique image-text pairs from over 6 million articles. Metadata and +expert-guided annotations are also provided. We demonstrate the utility and +accessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style +models continuously pre-trained on the BIOMEDICA dataset via streaming, +eliminating the need to download 27 TB of data locally. On average, our models +achieve state-of-the-art performance across 40 tasks - spanning pathology, +radiology, ophthalmology, dermatology, surgery, molecular biology, +parasitology, and cell biology - excelling in zero-shot classification with a +6.56% average improvement (as high as 29.8% and 17.5% in dermatology and +ophthalmology, respectively), and stronger image-text retrieval, all while +using 10x less compute. To foster reproducibility and collaboration, we release +our codebase and dataset for the broader research community. + +
+
+
+
+
+ + ♻ ☆ Recognizing Artistic Style of Archaeological Image Fragments Using Deep + Style Extrapolation + + +
+ Ancient artworks obtained in archaeological excavations usually suffer from a +certain degree of fragmentation and physical degradation. Often, fragments of +multiple artifacts from different periods or artistic styles could be found on +the same site. With each fragment containing only partial information about its +source, and pieces from different objects being mixed, categorizing broken +artifacts based on their visual cues could be a challenging task, even for +professionals. As classification is a common function of many machine learning +models, the power of modern architectures can be harnessed for efficient and +accurate fragment classification. In this work, we present a generalized +deep-learning framework for predicting the artistic style of image fragments, +achieving state-of-the-art results for pieces with varying styles and +geometries. + +
+
+ comment: To be published in the 27th International Conference on + Human-Computer Interaction (HCII 2025) +
+
+
+
+
+ + ♻ ☆ Flash Window Attention: speedup the attention computation for Swin + Transformer + + +
+ To address the high resolution of image pixels, the Swin Transformer
+introduces window attention. This mechanism divides an image into
+non-overlapping windows and restricts attention computation to within each
+window, significantly enhancing computational efficiency. To further optimize
+this process, one might consider replacing standard attention with flash
+attention, which has proven to be more efficient in language models. However, a
+direct substitution is ineffective. Flash attention is designed for long
+sequences, whereas window attention deals with shorter sequences but must
+handle a large number of them in parallel. In this report, we present an optimized
+solution called Flash Window Attention, tailored specifically for window
+attention. Flash Window Attention improves attention computation efficiency by
+up to 300% and enhances end-to-end runtime efficiency by up to 30%. Our code is
+available online.
+
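+ The window-partitioning pattern that this report targets can be summarized as follows; this is a plain PyTorch sketch of standard non-overlapping window attention (single head, queries = keys = values for brevity), not the fused Flash Window Attention kernel itself.
+import torch
+import torch.nn.functional as F
+
+def window_attention(x: torch.Tensor, win: int = 7) -> torch.Tensor:
+    # x: (B, H, W, C) with H and W divisible by win. Attention is restricted
+    # to each win x win window; all windows are processed in parallel as a batch.
+    B, H, W, C = x.shape
+    x = x.view(B, H // win, win, W // win, win, C)
+    x = x.permute(0, 1, 3, 2, 4, 5).reshape(-1, win * win, C)  # (B*num_windows, N, C)
+    out = F.scaled_dot_product_attention(x, x, x)
+    out = out.view(B, H // win, W // win, win, win, C)
+    return out.permute(0, 1, 3, 2, 4, 5).reshape(B, H, W, C)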
+
+
+
+
+ + ♻ ☆ Analyzing Infrastructure LiDAR Placement with Realistic LiDAR Simulation + Library ICRA'23 + + +
+ Recently, Vehicle-to-Everything(V2X) cooperative perception has attracted +increasing attention. Infrastructure sensors play a critical role in this +research field; however, how to find the optimal placement of infrastructure +sensors is rarely studied. In this paper, we investigate the problem of +infrastructure sensor placement and propose a pipeline that can efficiently and +effectively find optimal installation positions for infrastructure sensors in a +realistic simulated environment. To better simulate and evaluate LiDAR +placement, we establish a Realistic LiDAR Simulation library that can simulate +the unique characteristics of different popular LiDARs and produce +high-fidelity LiDAR point clouds in the CARLA simulator. Through simulating +point cloud data in different LiDAR placements, we can evaluate the perception +accuracy of these placements using multiple detection models. Then, we analyze +the correlation between the point cloud distribution and perception accuracy by +calculating the density and uniformity of regions of interest. Experiments show +that when using the same number and type of LiDAR, the placement scheme +optimized by our proposed method improves the average precision by 15%, +compared with the conventional placement scheme in the standard lane scene. We +also analyze the correlation between perception performance in the region of +interest and LiDAR point cloud distribution and validate that density and +uniformity can be indicators of performance. Both the RLS Library and related +code will be released at https://github.com/PJLab-ADG/PCSim. + +
+
+ comment: 7 pages, 6 figures, accepted to the IEEE International Conference on + Robotics and Automation (ICRA'23) +
+
+
+
+
+ + ♻ ☆ A Cascaded Dilated Convolution Approach for Mpox Lesion Classification + + +
+ The global outbreak of the Mpox virus, classified as a Public Health +Emergency of International Concern (PHEIC) by the World Health Organization, +presents significant diagnostic challenges due to its visual similarity to +other skin lesion diseases. Traditional diagnostic methods for Mpox, which rely +on clinical symptoms and laboratory tests, are slow and labor intensive. Deep +learning-based approaches for skin lesion classification offer a promising +alternative. However, developing a model that balances efficiency with accuracy +is crucial to ensure reliable and timely diagnosis without compromising +performance. This study introduces the Cascaded Atrous Group Attention (CAGA) +framework to address these challenges, combining the Cascaded Atrous Attention +module and the Cascaded Group Attention mechanism. The Cascaded Atrous +Attention module utilizes dilated convolutions and cascades the outputs to +enhance multi-scale representation. This is integrated into the Cascaded Group +Attention mechanism, which reduces redundancy in Multi-Head Self-Attention. By +integrating the Cascaded Atrous Group Attention module with EfficientViT-L1 as +the backbone architecture, this approach achieves state-of-the-art performance, +reaching an accuracy of 98% on the Mpox Close Skin Image (MCSI) dataset while +reducing model parameters by 37.5% compared to the original EfficientViT-L1. +The model's robustness is demonstrated through extensive validation on two +additional benchmark datasets, where it consistently outperforms existing +approaches. + +
+
+ comment: 8 pages, 4 figures, Submitted to Medical Imaging with Deep Learning +
+
+
+
+
+ + ♻ ☆ Implicit Neural Representations with Fourier Kolmogorov-Arnold Networks + + +
+ Implicit neural representations (INRs) use neural networks to provide +continuous and resolution-independent representations of complex signals with a +small number of parameters. However, existing INR models often fail to capture +important frequency components specific to each task. To address this issue, in +this paper, we propose a Fourier Kolmogorov Arnold network (FKAN) for INRs. The +proposed FKAN utilizes learnable activation functions modeled as Fourier series +in the first layer to effectively control and learn the task-specific frequency +components. In addition, the activation functions with learnable Fourier +coefficients improve the ability of the network to capture complex patterns and +details, which is beneficial for high-resolution and high-dimensional data. +Experimental results show that our proposed FKAN model outperforms three +state-of-the-art baseline schemes, and improves the peak signal-to-noise ratio +(PSNR) and structural similarity index measure (SSIM) for the image +representation task and intersection over union (IoU) for the 3D occupancy +volume representation task, respectively. The code is available at +github.com/Ali-Meh619/FKAN. + +
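+ One plausible reading of a learnable Fourier-series activation for the first layer, sketched under our own assumptions about the parameterization (the paper's exact form may differ):
+import torch
+import torch.nn as nn
+
+class FourierActivation(nn.Module):
+    # Element-wise phi(x) = sum_k a_k * sin(k*x) + b_k * cos(k*x) with learnable a_k, b_k.
+    def __init__(self, n_harmonics: int = 8):
+        super().__init__()
+        self.a = nn.Parameter(torch.randn(n_harmonics) * 0.1)
+        self.b = nn.Parameter(torch.randn(n_harmonics) * 0.1)
+        self.register_buffer("k", torch.arange(1, n_harmonics + 1).float())
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        kx = x.unsqueeze(-1) * self.k  # (..., n_harmonics)
+        return (self.a * torch.sin(kx) + self.b * torch.cos(kx)).sum(-1)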
+
+ comment: Accepted for publication in Proc. IEEE ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ Gradient descent with generalized Newton's method + + +
+ We propose the generalized Newton's method (GeN) -- a Hessian-informed +approach that applies to any optimizer such as SGD and Adam, and covers the +Newton-Raphson method as a sub-case. Our method automatically and dynamically +selects the learning rate that accelerates the convergence, without the +intensive tuning of the learning rate scheduler. In practice, our method is +easily implementable, since it only requires additional forward passes with +almost zero computational overhead (in terms of training time and memory cost), +if the overhead is amortized over many iterations. We present extensive +experiments on language and vision tasks (e.g. GPT and ResNet) to showcase that +GeN optimizers match the state-of-the-art performance, which was achieved with +carefully tuned learning rate schedulers. + +
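+ One way to realize a "learning rate from extra forward passes" scheme of this flavor is to probe the loss along the update direction and take the vertex of a fitted parabola; the sketch below is an illustrative approximation, not the exact GeN update, and `loss_fn`, `params`, `grads` are hypothetical handles.
+import numpy as np
+
+def quadratic_lr(loss_fn, params, grads, trial_lrs=(0.0, 1e-3, 2e-3)):
+    # Probe the loss at a few trial step sizes along -grad, fit
+    # loss(eta) ~ a*eta^2 + b*eta + c, and return the vertex -b / (2a).
+    losses = [loss_fn([p - eta * g for p, g in zip(params, grads)])
+              for eta in trial_lrs]
+    a, b, c = np.polyfit(trial_lrs, losses, deg=2)
+    return -b / (2.0 * a) if a > 0 else trial_lrs[-1]  # fall back if locally non-convex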
+
+
+
+
+ + ♻ ☆ MambaTrack: Exploiting Dual-Enhancement for Night UAV Tracking + + +
+ Night unmanned aerial vehicle (UAV) tracking is impeded by the challenges of
+poor illumination, with previous daylight-optimized methods demonstrating
+suboptimal performance in low-light conditions, limiting the utility of UAV
+applications. To this end, we propose an efficient mamba-based tracker,
+leveraging dual enhancement techniques to boost night UAV tracking. The
+mamba-based low-light enhancer, equipped with an illumination estimator and a
+damage restorer, achieves global image enhancement while preserving the details
+and structure of low-light images. Additionally, we advance a cross-modal mamba
+network to achieve efficient interactive learning between vision and language
+modalities. Extensive experiments showcase that our method achieves advanced
+performance and exhibits significantly improved computation and memory
+efficiency. For instance, our method is 2.8x faster than CiteTracker and
+reduces GPU memory usage by 50.2%. Our codes are available at
+https://github.com/983632847/Awesome-Multimodal-Object-Tracking.
+
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Dissecting Query-Key Interaction in Vision Transformers + + +
+ Self-attention in vision transformers is often thought to perform perceptual +grouping where tokens attend to other tokens with similar embeddings, which +could correspond to semantically similar features of an object. However, +attending to dissimilar tokens can be beneficial by providing contextual +information. We propose to analyze the query-key interaction by the singular +value decomposition of the interaction matrix (i.e. +${\textbf{W}_q}^\top\textbf{W}_k$). We find that in many ViTs, especially those +with classification training objectives, early layers attend more to similar +tokens, while late layers show increased attention to dissimilar tokens, +providing evidence corresponding to perceptual grouping and contextualization, +respectively. Many of these interactions between features represented by +singular vectors are interpretable and semantic, such as attention between +relevant objects, between parts of an object, or between the foreground and +background. This offers a novel perspective on interpreting the attention +mechanism, which contributes to understanding how transformer models utilize +context and salient features when processing images. + +
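+ The analysis described above boils down to a singular value decomposition per attention head (a sketch; how `W_q` and `W_k` are extracted from a given ViT implementation is left to the reader):
+import torch
+
+def query_key_spectrum(W_q: torch.Tensor, W_k: torch.Tensor):
+    # SVD of the query-key interaction matrix W_q^T W_k for one head;
+    # paired singular vectors indicate which feature directions attend to which.
+    interaction = W_q.transpose(-2, -1) @ W_k
+    U, S, Vh = torch.linalg.svd(interaction)
+    return U, S, Vh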
+
+
+
+
+ + ♻ ☆ Smartphone-based Eye Tracking System using Edge Intelligence and Model + Optimisation + + +
+ A significant limitation of current smartphone-based eye-tracking algorithms +is their low accuracy when applied to video-type visual stimuli, as they are +typically trained on static images. Also, the increasing demand for real-time +interactive applications like games, VR, and AR on smartphones requires +overcoming the limitations posed by resource constraints such as limited +computational power, battery life, and network bandwidth. Therefore, we +developed two new smartphone eye-tracking techniques for video-type visuals by +combining Convolutional Neural Networks (CNN) with two different Recurrent +Neural Networks (RNN), namely Long Short Term Memory (LSTM) and Gated Recurrent +Unit (GRU). Our CNN+LSTM and CNN+GRU models achieved an average Root Mean +Square Error of 0.955 cm and 1.091 cm, respectively. To address the +computational constraints of smartphones, we developed an edge intelligence +architecture to enhance the performance of smartphone-based eye tracking. We +applied various optimisation methods like quantisation and pruning to deep +learning models for better energy, CPU, and memory usage on edge devices, +focusing on real-time processing. Using model quantisation, the model inference +time in the CNN+LSTM and CNN+GRU models was reduced by 21.72% and 19.50%, +respectively, on edge devices. + +
+
+ comment: I have included the three papers as reference, which are closely + related. We have expanded the future work section to provide a more thorough + discussion of the concepts of "varying lighting conditions" and "dynamic user + environments." We have added a note below Table 4 to clarify the + abbreviations' meaning. Elaborated the role of the Domain Expert within the + presentation layer in Section 4.1 +
+
+
+
+
+ + ♻ ☆ The Collection of a Human Robot Collaboration Dataset for Cooperative + Assembly in Glovebox Environments IJRR + + +
+ Industry 4.0 introduced AI as a transformative solution for modernizing +manufacturing processes. Its successor, Industry 5.0, envisions humans as +collaborators and experts guiding these AI-driven manufacturing solutions. +Developing these techniques necessitates algorithms capable of safe, real-time +identification of human positions in a scene, particularly their hands, during +collaborative assembly. Although substantial efforts have curated datasets for +hand segmentation, most focus on residential or commercial domains. Existing +datasets targeting industrial settings predominantly rely on synthetic data, +which we demonstrate does not effectively transfer to real-world operations. +Moreover, these datasets lack uncertainty estimations critical for safe +collaboration. Addressing these gaps, we present HAGS: Hand and Glove +Segmentation Dataset. This dataset provides challenging examples to build +applications toward hand and glove segmentation in industrial human-robot +collaboration scenarios as well as assess out-of-distribution images, +constructed via green screen augmentations, to determine ML-classifier +robustness. We study state-of-the-art, real-time segmentation models to +evaluate existing methods. Our dataset and baselines are publicly available. + +
+
+ comment: draft paper to be submitted to IJRR +
+
+
+
+
+ + ♻ ☆ A systematic review of the use of Deep Learning in Satellite Imagery for + Agriculture + + +
+ Agricultural research is essential for increasing food production to meet the +requirements of an increasing population in the coming decades. Recently, +satellite technology has been improving rapidly and deep learning has seen much +success in generic computer vision tasks and many application areas which +presents an important opportunity to improve analysis of agricultural land. +Here we present a systematic review of 150 studies to find the current uses of +deep learning on satellite imagery for agricultural research. Although we +identify 5 categories of agricultural monitoring tasks, the majority of the +research interest is in crop segmentation and yield prediction. We found that, +when used, modern deep learning methods consistently outperformed traditional +machine learning across most tasks; the only exception was that Long Short-Term +Memory (LSTM) Recurrent Neural Networks did not consistently outperform Random +Forests (RF) for yield prediction. The reviewed studies have largely adopted +methodologies from generic computer vision, except for one major omission: +benchmark datasets are not utilised to evaluate models across studies, making +it difficult to compare results. Additionally, some studies have specifically +utilised the extra spectral resolution available in satellite imagery, but +other divergent properties of satellite images - such as the hugely different +scales of spatial patterns - are not being taken advantage of in the reviewed +studies. + +
+
+ comment: 23 pages, 5 figures and 10 tables in main paper. Final version, as + submitted and accepted at JSTARS +
+
+
+
+
+ + ♻ ☆ Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment + Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging + + +
+ Objectives: To evaluate the zero-shot performance of Segment Anything Model 2
+(SAM 2) in 3D segmentation of abdominal organs in CT scans, and to investigate
+the effects of prompt settings on segmentation results.
+ Materials and Methods: In this retrospective study, we used a subset of the
+TotalSegmentator CT dataset from eight institutions to assess SAM 2's ability
+to segment eight abdominal organs. Segmentation was initiated from three
+different z-coordinate levels (caudal, mid, and cranial levels) of each organ.
+Performance was measured using the Dice similarity coefficient (DSC). We also
+analyzed the impact of "negative prompts," which explicitly exclude certain
+regions from the segmentation process, on accuracy.
+ Results: 123 patients (mean age, 60.7 ± 15.5 years; 63 men, 60 women) were
+evaluated. As a zero-shot approach, larger organs with clear boundaries
+demonstrated high segmentation performance, with mean DSCs as follows: liver
+0.821 ± 0.192, right kidney 0.862 ± 0.212, left kidney 0.870 ± 0.154, and
+spleen 0.891 ± 0.131. Smaller organs showed lower performance: gallbladder
+0.531 ± 0.291, pancreas 0.361 ± 0.197, and adrenal glands, right 0.203 ±
+0.222, left 0.308 ± 0.234. The initial slice for segmentation and the use of
+negative prompts significantly influenced the results. By removing negative
+prompts from the input, the DSCs significantly decreased for six organs.
+ Conclusion: SAM 2 demonstrated promising zero-shot performance in segmenting
+certain abdominal organs in CT scans, particularly larger organs. Performance
+was significantly influenced by input negative prompts and initial slice
+selection, highlighting the importance of optimizing these factors.
+
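+ For completeness, the Dice similarity coefficient used as the evaluation metric above, in its standard form (not code from the study):
+import numpy as np
+
+def dice(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-8) -> float:
+    # DSC = 2 * |A intersect B| / (|A| + |B|) for two binary 3D masks.
+    pred, gt = pred.astype(bool), gt.astype(bool)
+    inter = np.logical_and(pred, gt).sum()
+    return float(2.0 * inter / (pred.sum() + gt.sum() + eps))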
+
+ comment: 20 pages, 7 figures (including 2 supplemental figure), 4 tables +
+
+
+
+
+ + ♻ ☆ XVertNet: Unsupervised Contrast Enhancement of Vertebral Structures with + Dynamic Self-Tuning Guidance and Multi-Stage Analysis + + +
+ Chest X-rays remain the primary diagnostic tool in emergency medicine, yet
+their limited ability to capture fine anatomical details can result in missed
+or delayed diagnoses. To address this, we introduce XVertNet, a novel
+deep-learning framework designed to enhance vertebral structure visualization
+in X-ray images significantly. Our framework introduces two key innovations:
+(1) an unsupervised learning architecture that eliminates reliance on manually
+labeled training data, a persistent bottleneck in medical imaging, and (2) a
+dynamic self-tuned internal guidance mechanism featuring an adaptive feedback
+loop for real-time image optimization. Extensive validation across four major
+public datasets revealed that XVertNet outperforms state-of-the-art enhancement
+methods, as demonstrated by improvements in entropy scores, Tenengrad criterion
+values, the local phase coherence sharpness index (LPC-SI), and the tone-mapped
+image quality index (TMQI). Furthermore, clinical validation conducted with two
+board-certified radiologists confirmed that the enhanced images enabled more
+sensitive detection of subtle vertebral fractures and degenerative changes. The
+unsupervised nature of XVertNet facilitates immediate clinical deployment
+without requiring additional training overhead. This innovation represents a
+transformative advancement in emergency radiology, providing a scalable and
+time-efficient solution to enhance diagnostic accuracy in high-pressure
+clinical environments.
+
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Expressive Text-to-Image Generation with Rich Text + + +
+ Plain text has become a prevalent interface for text-to-image synthesis. +However, its limited customization options hinder users from accurately +describing desired outputs. For example, plain text makes it hard to specify +continuous quantities, such as the precise RGB color value or importance of +each word. Furthermore, creating detailed text prompts for complex scenes is +tedious for humans to write and challenging for text encoders to interpret. To +address these challenges, we propose using a rich-text editor supporting +formats such as font style, size, color, and footnote. We extract each word's +attributes from rich text to enable local style control, explicit token +reweighting, precise color rendering, and detailed region synthesis. We achieve +these capabilities through a region-based diffusion process. We first obtain +each word's region based on attention maps of a diffusion process using plain +text. For each region, we enforce its text attributes by creating +region-specific detailed prompts and applying region-specific guidance, and +maintain its fidelity against plain-text generation through region-based +injections. We present various examples of image generation from rich text and +demonstrate that our method outperforms strong baselines with quantitative +evaluations. + +
+
+ comment: Project webpage: https://rich-text-to-image.github.io/ +
+
+
+
+
+ + ♻ ☆ UrbanIR: Large-Scale Urban Scene Inverse Rendering from a Single Video + + +
+ We present UrbanIR (Urban Scene Inverse Rendering), a new inverse graphics +model that enables realistic, free-viewpoint renderings of scenes under various +lighting conditions with a single video. It accurately infers shape, albedo, +visibility, and sun and sky illumination from wide-baseline videos, such as +those from car-mounted cameras, differing from NeRF's dense view settings. In +this context, standard methods often yield subpar geometry and material +estimates, such as inaccurate roof representations and numerous 'floaters'. +UrbanIR addresses these issues with novel losses that reduce errors in inverse +graphics inference and rendering artifacts. Its techniques allow for precise +shadow volume estimation in the original scene. The model's outputs support +controllable editing, enabling photorealistic free-viewpoint renderings of +night simulations, relit scenes, and inserted objects, marking a significant +improvement over existing state-of-the-art methods. + +
+
+ comment: https://urbaninverserendering.github.io/ +
+
+
+
+
+ + ♻ ☆ SYNAPSE: SYmbolic Neural-Aided Preference Synthesis Engine + + +
+ This paper addresses the problem of preference learning, which aims to align +robot behaviors through learning user specific preferences (e.g. "good +pull-over location") from visual demonstrations. Despite its similarity to +learning factual concepts (e.g. "red door"), preference learning is a +fundamentally harder problem due to its subjective nature and the paucity of +person-specific training data. We address this problem using a novel framework +called SYNAPSE, which is a neuro-symbolic approach designed to efficiently +learn preferential concepts from limited data. SYNAPSE represents preferences +as neuro-symbolic programs, facilitating inspection of individual parts for +alignment, in a domain-specific language (DSL) that operates over images and +leverages a novel combination of visual parsing, large language models, and +program synthesis to learn programs representing individual preferences. We +perform extensive evaluations on various preferential concepts as well as user +case studies demonstrating its ability to align well with dissimilar user +preferences. Our method significantly outperforms baselines, especially when it +comes to out of distribution generalization. We show the importance of the +design choices in the framework through multiple ablation studies. Code, +additional results, and supplementary material can be found on the website: +https://amrl.cs.utexas.edu/synapse + +
+
+ comment: Accepted (oral) at AAAI 25 +
+
+
+
+
+ + ♻ ☆ Enhancing Performance of Point Cloud Completion Networks with + Consistency Loss + + +
+ Point cloud completion networks are conventionally trained to minimize the
+disparities between the completed point cloud and the ground-truth counterpart.
+However, an incomplete object-level point cloud can have multiple valid
+completion solutions when it is examined in isolation. This one-to-many mapping
+issue can cause contradictory supervision signals to the network because the
+loss function may produce different values for identical input-output pairs of
+the network. In many cases, this issue could adversely affect the network
+optimization process. In this work, we propose to enhance the conventional
+learning objective using a novel completion consistency loss to mitigate the
+one-to-many mapping problem. Specifically, the proposed consistency loss ensures
+that a point cloud completion network generates a coherent completion solution
+for incomplete objects originating from the same source point cloud.
+Experimental results across multiple well-established datasets and benchmarks
+demonstrate that the proposed completion consistency loss has an excellent capability
+to enhance the completion performance of various existing networks without any
+modification to their design. The proposed consistency loss
+enhances the performance of the point completion network without affecting the
+inference speed, thereby increasing the accuracy of point cloud completion.
+Notably, a state-of-the-art point completion network trained with the proposed
+consistency loss can achieve state-of-the-art accuracy on the challenging new
+MVP dataset. The code and results of experiments on various point completion models
+using the proposed consistency loss will be available at:
+https://github.com/kaist-avelab/ConsistencyLoss.
+
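+ A minimal sketch of what a completion consistency term of this kind could look like, penalizing pairwise Chamfer disagreement between completions predicted from different partial views of the same source cloud; the paper's exact formulation may differ.
+import torch
+
+def chamfer(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+    # Symmetric Chamfer distance between point sets a: (N, 3) and b: (M, 3).
+    d = torch.cdist(a, b)  # (N, M) pairwise Euclidean distances
+    return d.min(dim=1).values.mean() + d.min(dim=0).values.mean()
+
+def consistency_loss(completions: list) -> torch.Tensor:
+    # completions: list of (N_i, 3) tensors predicted from partial views of one source.
+    loss, pairs = completions[0].new_zeros(()), 0
+    for i in range(len(completions)):
+        for j in range(i + 1, len(completions)):
+            loss, pairs = loss + chamfer(completions[i], completions[j]), pairs + 1
+    return loss / max(pairs, 1)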
+
+ comment: First version of Paper "Enhancing Performance of Point Cloud + Completion Networks with Consistency Loss" by Kevin Tirta Wijaya and + Christofel Rio Goenawan. In process submission to Neurocomputing Journal 2024 +
+
+
+
+
+ + ♻ ☆ SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting + + +
+ Achieving high-fidelity 3D reconstruction from monocular video remains +challenging due to the inherent limitations of traditional methods like +Structure-from-Motion (SfM) and monocular SLAM in accurately capturing scene +details. While differentiable rendering techniques such as Neural Radiance +Fields (NeRF) address some of these challenges, their high computational costs +make them unsuitable for real-time applications. Additionally, existing 3D +Gaussian Splatting (3DGS) methods often focus on photometric consistency, +neglecting geometric accuracy and failing to exploit SLAM's dynamic depth and +pose updates for scene refinement. We propose a framework integrating dense +SLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach +introduces SLAM-Informed Adaptive Densification, which dynamically updates and +densifies the Gaussian model by leveraging dense point clouds from SLAM. +Additionally, we incorporate Geometry-Guided Optimization, which combines +edge-aware geometric constraints and photometric consistency to jointly +optimize the appearance and geometry of the 3DGS scene representation, enabling +detailed and accurate SLAM mapping reconstruction. Experiments on the Replica +and TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving +state-of-the-art results among monocular systems. Specifically, our method +achieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica, +representing improvements of 10.7%, 6.4%, and 49.4%, respectively, over the +previous SOTA. On TUM-RGBD, our method outperforms the closest baseline by +10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the +potential of our framework in bridging the gap between photometric and +geometric dense 3D scene representations, paving the way for practical and +efficient monocular dense reconstruction. + +
+
+
+
+
+ + ♻ ☆ On the Geometry of Deep Learning + + +
+ In this paper, we overview one promising avenue of progress at the +mathematical foundation of deep learning: the connection between deep networks +and function approximation by affine splines (continuous piecewise linear +functions in multiple dimensions). In particular, we will overview work over +the past decade on understanding certain geometrical properties of a deep +network's affine spline mapping, in particular how it tessellates its input +space. As we will see, the affine spline connection and geometrical viewpoint +provide a powerful portal through which to view, analyze, and improve the inner +workings of a deep network. + +
+
+ comment: Accepted for publication at 'Notices of the American Mathematical + Society' +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 37 + +
+
+
+ + ☆ SafeSwarm: Decentralized Safe RL for the Swarm of Drones Landing in + Dense Crowds + + +
+ This paper introduces a safe swarm of drones capable of performing landings +in crowded environments robustly by relying on Reinforcement Learning +techniques combined with Safe Learning. The developed system allows us to teach +the swarm of drones with different dynamics to land on moving landing pads in +an environment while avoiding collisions with obstacles and between agents. + The safe barrier net algorithm was developed and evaluated using a swarm of +Crazyflie 2.1 micro quadrotors, which were tested indoors with the Vicon motion +capture system to ensure precise localization and control. + Experimental results show that our system achieves landing accuracy of 2.25 +cm with a mean time of 17 s and collision-free landings, underscoring its +effectiveness and robustness in real-world scenarios. This work offers a +promising foundation for applications in environments where safety and +precision are paramount. + +
+
+
+
+
+ + ☆ Inductive Learning of Robot Task Knowledge from Raw Data and Online + Expert Feedback + + +
+ The increasing level of autonomy of robots poses challenges of trust and +social acceptance, especially in human-robot interaction scenarios. This +requires an interpretable implementation of robotic cognitive capabilities, +possibly based on formal methods as logics for the definition of task +specifications. However, prior knowledge is often unavailable in complex +realistic scenarios. + In this paper, we propose an offline algorithm based on inductive logic +programming from noisy examples to extract task specifications (i.e., action +preconditions, constraints and effects) directly from raw data of few +heterogeneous (i.e., not repetitive) robotic executions. Our algorithm +leverages on the output of any unsupervised action identification algorithm +from video-kinematic recordings. Combining it with the definition of very +basic, almost task-agnostic, commonsense concepts about the environment, which +contribute to the interpretability of our methodology, we are able to learn +logical axioms encoding preconditions of actions, as well as their effects in +the event calculus paradigm. Since the quality of learned specifications +depends mainly on the accuracy of the action identification algorithm, we also +propose an online framework for incremental refinement of task knowledge from +user feedback, guaranteeing safe execution. Results in a standard manipulation +task and benchmark for user training in the safety-critical surgical robotic +scenario, show the robustness, data- and time-efficiency of our methodology, +with promising results towards the scalability in more complex domains. + +
+
+
+
+
+ + ☆ The Sense of Agency in Assistive Robotics Using Shared Autonomy + + +
+ Sense of agency is one factor that influences people's preferences for robot +assistance and a phenomenon from cognitive science that represents the +experience of control over one's environment. However, in assistive robotics +literature, we often see paradigms that optimize measures like task success and +cognitive load, rather than sense of agency. In fact, prior work has found that +participants sometimes express a preference for paradigms, such as direct +teleoperation, which do not perform well with those other metrics but give more +control to the user. In this work, we focus on a subset of assistance paradigms +for manipulation called shared autonomy in which the system combines control +signals from the user and the automated control. We run a study to evaluate +sense of agency and show that higher robot autonomy during assistance leads to +improved task performance but a decreased sense of agency, indicating a +potential trade-off between task performance and sense of agency. From our +findings, we discuss the relation between sense of agency and optimality, and +we consider a proxy metric for a component of sense of agency which might +enable us to build systems that monitor and maintain sense of agency in real +time. + +
+
+ comment: 10 pages, 8 figures, HRI conference
+
+
+
+
+ + ☆ Empirical Comparison of Four Stereoscopic Depth Sensing Cameras for + Robotics Applications + + +
+ Depth sensing is an essential technology in robotics and many other fields. +Many depth sensing (or RGB-D) cameras are available on the market and selecting +the best one for your application can be challenging. In this work, we tested +four stereoscopic RGB-D cameras that sense the distance by using two images +from slightly different views. We empirically compared four cameras (Intel +RealSense D435, Intel RealSense D455, StereoLabs ZED 2, and Luxonis OAK-D Pro) +in three scenarios: (i) planar surface perception, (ii) plastic doll +perception, (iii) household object perception (YCB dataset). We recorded and +evaluated more than 3,000 RGB-D frames for each camera. For table-top robotics +scenarios with distance to objects up to one meter, the best performance is +provided by the D435 camera. For longer distances, the other three models +perform better, making them more suitable for some mobile robotics +applications. OAK-D Pro additionally offers integrated AI modules (e.g., object +and human keypoint detection). ZED 2 is not a standalone device and requires a +computer with a GPU for depth data acquisition. All data (more than 12,000 +RGB-D frames) are publicly available at https://osf.io/f2seb. + +
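One common way to score the planar-surface scenario is to fit a plane to the returned point cloud and report the RMS residual. The helper below is an assumed illustration of such an evaluation, not the paper's exact protocol; the synthetic "sensor" data and the 2 mm noise level are made up.

```python
# Hypothetical evaluation helper: RMS error of a depth camera against a flat wall.
# Fit a plane to the measured points (least squares) and report the residual RMSE.
import numpy as np

def plane_rmse(points: np.ndarray) -> float:
    """points: (N, 3) array of x, y, z samples of a nominally planar surface."""
    centered = points - points.mean(axis=0)
    # The plane normal is the right singular vector with the smallest singular value.
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    normal = vt[-1]
    residuals = centered @ normal            # signed point-to-plane distances
    return float(np.sqrt(np.mean(residuals ** 2)))

# Example with synthetic "sensor" data: a wall at z = 1 m plus 2 mm noise.
rng = np.random.default_rng(42)
xy = rng.uniform(-0.5, 0.5, size=(5000, 2))
z = np.full(5000, 1.0) + rng.normal(scale=0.002, size=5000)
cloud = np.column_stack([xy, z])
print(f"plane RMSE: {plane_rmse(cloud) * 1000:.2f} mm")   # ~2 mm expected
```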
+
+
+
+
+ + ☆ Efficiently Closing Loops in LiDAR-Based SLAM Using Point Cloud Density + Maps + + +
+ Consistent maps are key for most autonomous mobile robots. They often use +SLAM approaches to build such maps. Loop closures via place recognition help +maintain accurate pose estimates by mitigating global drift. This paper +presents a robust loop closure detection pipeline for outdoor SLAM with +LiDAR-equipped robots. The method handles various LiDAR sensors with different +scanning patterns, field of views and resolutions. It generates local maps from +LiDAR scans and aligns them using a ground alignment module to handle both +planar and non-planar motion of the LiDAR, ensuring applicability across +platforms. The method uses density-preserving bird's eye view projections of +these local maps and extracts ORB feature descriptors from them for place +recognition. It stores the feature descriptors in a binary search tree for +efficient retrieval, and self-similarity pruning addresses perceptual aliasing +in repetitive environments. Extensive experiments on public and self-recorded +datasets demonstrate accurate loop closure detection, long-term localization, +and cross-platform multi-map alignment, agnostic to the LiDAR scanning +patterns, fields of view, and motion profiles. + +
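A rough sketch of the general idea (not the paper's implementation) is shown below: project a local LiDAR map into a density-preserving bird's-eye-view image and extract ORB descriptors from it for place recognition. The grid resolution, image size, and log compression are illustrative assumptions.

```python
# Sketch of the general idea: density-preserving BEV projection of a local map,
# followed by ORB feature extraction for place recognition.
import numpy as np
import cv2

def bev_density_image(points: np.ndarray, res=0.2, size=256) -> np.ndarray:
    """points: (N, 3) local-map points; returns an 8-bit size x size density image."""
    half = res * size / 2.0
    mask = (np.abs(points[:, 0]) < half) & (np.abs(points[:, 1]) < half)
    ij = ((points[mask, :2] + half) / res).astype(int)
    hist, _, _ = np.histogram2d(ij[:, 0], ij[:, 1],
                                bins=size, range=[[0, size], [0, size]])
    hist = np.log1p(hist)                       # compress dynamic range, keep density
    return (255 * hist / max(hist.max(), 1e-9)).astype(np.uint8)

def orb_descriptors(bev: np.ndarray):
    orb = cv2.ORB_create(nfeatures=500)
    keypoints, descriptors = orb.detectAndCompute(bev, None)
    return keypoints, descriptors

# Descriptors from two local maps can then be matched with a Hamming-distance
# matcher (e.g. cv2.BFMatcher(cv2.NORM_HAMMING)) to propose loop-closure candidates.
```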
+
+
+
+
+ + ☆ Fast-Revisit Coverage Path Planning for Autonomous Mobile Patrol Robots + Using Long-Range Sensor Information + + +
+ The utilization of Unmanned Ground Vehicles (UGVs) for patrolling industrial
+sites has expanded significantly. These UGVs are typically equipped with
+perception systems, e.g., computer vision, with limited range due to sensor
+limitations or site topology. High-level control of the UGVs requires Coverage
+Path Planning (CPP) algorithms that navigate all relevant waypoints and
+promptly start the next cycle. In this paper, we propose the novel Fast-Revisit
+Coverage Path Planning (FaRe-CPP) algorithm, which uses a greedy heuristic approach
+to propose waypoints for maximum coverage area and a random search-based path
+optimization technique to obtain a path along the proposed waypoints with
+minimum revisit time. We evaluated the algorithm in a simulated environment
+using Gazebo and a camera-equipped TurtleBot3 against a number of existing
+algorithms. Compared to the average revisit times and path lengths of these
+algorithms, our FaRe-CPP algorithm showed reductions of approximately 45% and
+40%, respectively, in these highly relevant performance indicators. +
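For intuition, a generic greedy maximum-coverage waypoint selection in the spirit of the heuristic described above might look like the sketch below. The sensing radius, grid discretization, and termination criterion are assumptions for illustration, not the FaRe-CPP specification.

```python
# Hedged sketch of greedy maximum-coverage waypoint selection (illustrative only).
import numpy as np

def greedy_waypoints(candidates: np.ndarray, cells: np.ndarray,
                     sensor_range: float, coverage_goal: float = 0.99):
    """Pick candidate waypoints until coverage_goal of the site cells is observed."""
    covered = np.zeros(len(cells), dtype=bool)
    chosen = []
    while covered.mean() < coverage_goal:
        # Coverage gain of each candidate: newly seen cells within sensor range.
        dists = np.linalg.norm(cells[None, :, :] - candidates[:, None, :], axis=2)
        gains = ((dists < sensor_range) & ~covered[None, :]).sum(axis=1)
        best = int(np.argmax(gains))
        if gains[best] == 0:
            break                                   # remaining cells unreachable
        chosen.append(candidates[best])
        covered |= dists[best] < sensor_range
    return np.array(chosen)

# Example: cover a 50 m x 50 m site with a 10 m long-range sensor.
rng = np.random.default_rng(1)
site_cells = rng.uniform(0, 50, size=(2000, 2))
waypoint_candidates = rng.uniform(0, 50, size=(200, 2))
print("waypoints selected:",
      len(greedy_waypoints(waypoint_candidates, site_cells, sensor_range=10.0)))
```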
+
+
+
+
+ + ☆ Evaluation of Artificial Intelligence Methods for Lead Time Prediction + in Non-Cycled Areas of Automotive Production + + +
+ The present study examines the effectiveness of applying Artificial
+Intelligence methods in an automotive production environment to predict unknown
+lead times in a non-cycle-controlled production area. Data structures are
+analyzed to identify contextual features and then preprocessed using one-hot
+encoding. Method selection focuses on supervised machine learning techniques.
+Among supervised learning methods, regression and classification approaches are
+evaluated. Continuous regression based on the target size distribution is not
+feasible. Analysis of the classification methods shows that Ensemble Learning and
+Support Vector Machines are the most suitable. Preliminary study results
+indicate that the gradient boosting algorithms LightGBM, XGBoost, and CatBoost
+yield the best results. After further testing and extensive hyperparameter
+optimization, the final method choice is the LightGBM algorithm. Depending on
+feature availability and prediction interval granularity, relative prediction
+accuracies of up to 90% can be achieved. Further tests highlight the importance
+of periodically retraining the AI models to accurately represent complex production
+processes using the database. The research demonstrates that AI methods can be
+effectively applied to highly variable production data, adding business value
+by providing an additional metric for various control tasks while outperforming
+current non-AI-based systems. +
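The study only reports the method choice; the sketch below shows what a one-hot-encoded LightGBM interval classifier could look like. The feature names, interval bin edges, and synthetic data are all assumptions for illustration.

```python
# Illustrative only: one-hot encoded context features feeding a LightGBM
# classifier that predicts a lead-time interval (bins and features are made up).
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
n = 5000
df = pd.DataFrame({
    "part_family": rng.choice(["A", "B", "C"], size=n),
    "station": rng.choice(["paint", "assembly", "rework"], size=n),
    "queue_length": rng.integers(0, 40, size=n),
})
# Synthetic target: lead-time interval (<1 day, 1-3 days, >3 days).
lead_time_h = 4 + 2.5 * df["queue_length"] + rng.normal(0, 8, size=n)
y = pd.cut(lead_time_h, bins=[-np.inf, 24, 72, np.inf], labels=False)

X = pd.get_dummies(df, columns=["part_family", "station"])  # one-hot encoding
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

model = LGBMClassifier(n_estimators=300, learning_rate=0.05)
model.fit(X_tr, y_tr)
print("interval accuracy:", (model.predict(X_te) == np.asarray(y_te)).mean())
```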
+
+ comment: 7 pages, 4 figures, CLC2024 Conference +
+
+
+
+
+ + ☆ ViewVR: Visual Feedback Modes to Achieve Quality of VR-based + Telemanipulation + + +
+ The paper focuses on an immersive teleoperation system that enhances the
+operator's ability to actively perceive the robot's surroundings. A
+consumer-grade HTC Vive VR system was used to synchronize the operator's hand
+and head movements with a UR3 robot and a custom-built robotic head with two
+degrees of freedom (2-DoF). The system's usability, manipulation efficiency,
+and intuitiveness of control were evaluated in comparison with static head
+camera positioning across three distinct tasks. Code and other supplementary
+materials are available at: https://github.com/ErkhovArtem/ViewVR +
+
+
+
+
+ + ☆ GestLLM: Advanced Hand Gesture Interpretation via Large Language Models + for Human-Robot Interaction + + +
+ This paper introduces GestLLM, an advanced system for human-robot interaction +that enables intuitive robot control through hand gestures. Unlike conventional +systems, which rely on a limited set of predefined gestures, GestLLM leverages +large language models and feature extraction via MediaPipe to interpret a +diverse range of gestures. This integration addresses key limitations in +existing systems, such as restricted gesture flexibility and the inability to +recognize complex or unconventional gestures commonly used in human +communication. + By combining state-of-the-art feature extraction and language model +capabilities, GestLLM achieves performance comparable to leading +vision-language models while supporting gestures underrepresented in +traditional datasets. For example, this includes gestures from popular culture, +such as the ``Vulcan salute" from Star Trek, without any additional +pretraining, prompt engineering, etc. This flexibility enhances the naturalness +and inclusivity of robot control, making interactions more intuitive and +user-friendly. + GestLLM provides a significant step forward in gesture-based interaction, +enabling robots to understand and respond to a wide variety of hand gestures +effectively. This paper outlines its design, implementation, and evaluation, +demonstrating its potential applications in advanced human-robot collaboration, +assistive robotics, and interactive entertainment. + +
+
+
+
+
+ + ☆ PO-GVINS: Tightly Coupled GNSS-Visual-Inertial Integration with + Pose-Only Representation + + +
+ Accurate and reliable positioning is crucial for perception, decision-making,
+and other high-level applications in autonomous driving, unmanned aerial
+vehicles, and intelligent robots. Given the inherent limitations of standalone
+sensors, integrating heterogeneous sensors with complementary capabilities is
+one of the most effective approaches to achieving this goal. In this paper, we
+propose a filtering-based, tightly coupled global navigation satellite system
+(GNSS)-visual-inertial positioning framework with a pose-only formulation
+applied to the visual-inertial system (VINS), termed PO-GVINS. Specifically,
+the multiple-view imaging used in current VINS requires priors on the 3D features
+and then jointly estimates camera poses and 3D feature positions, which inevitably
+introduces linearization errors for the features and leads to dimensional
+explosion of the state. In contrast, the pose-only (PO) formulation, which has
+been shown to be equivalent to multiple-view imaging and has been applied in
+visual reconstruction, represents feature depth using two camera poses; the 3D
+feature positions are thus removed from the state vector, avoiding the
+aforementioned difficulties. Inspired by this, we first apply the PO formulation
+in our VINS, i.e., PO-VINS. GNSS raw measurements are then incorporated, with
+integer ambiguities resolved, to achieve accurate and drift-free estimation.
+Extensive experiments demonstrate that the proposed PO-VINS significantly
+outperforms the multi-state constrained Kalman filter (MSCKF). By incorporating
+GNSS measurements, PO-GVINS achieves accurate, drift-free state estimation,
+making it a robust solution for positioning in challenging environments. +
+
+
+
+
+ + ☆ GazeGrasp: DNN-Driven Robotic Grasping with Wearable Eye-Gaze Interface + + +
+ We present GazeGrasp, a gaze-based manipulation system enabling individuals +with motor impairments to control collaborative robots using eye-gaze. The +system employs an ESP32 CAM for eye tracking, MediaPipe for gaze detection, and +YOLOv8 for object localization, integrated with a Universal Robot UR10 for +manipulation tasks. After user-specific calibration, the system allows +intuitive object selection with a magnetic snapping effect and robot control +via eye gestures. Experimental evaluation involving 13 participants +demonstrated that the magnetic snapping effect significantly reduced gaze +alignment time, improving task efficiency by 31%. GazeGrasp provides a robust, +hands-free interface for assistive robotics, enhancing accessibility and +autonomy for users. + +
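A minimal sketch of the "magnetic snapping" idea is shown below: the raw gaze point locks onto the nearest detected object centroid (e.g. from a YOLO bounding box) when it falls within a snap radius. The radius value and data layout are assumptions, not the system's actual parameters.

```python
# Minimal sketch of a "magnetic snapping" effect for gaze-based object selection.
import numpy as np

def snap_gaze(gaze_xy, object_boxes, snap_radius_px=60.0):
    """gaze_xy: (x, y) pixel coords; object_boxes: list of (x1, y1, x2, y2) boxes."""
    gaze = np.asarray(gaze_xy, dtype=float)
    if not object_boxes:
        return None, gaze
    centroids = np.array([[(x1 + x2) / 2.0, (y1 + y2) / 2.0]
                          for x1, y1, x2, y2 in object_boxes])
    dists = np.linalg.norm(centroids - gaze, axis=1)
    nearest = int(np.argmin(dists))
    if dists[nearest] <= snap_radius_px:
        return nearest, centroids[nearest]   # snapped: select object `nearest`
    return None, gaze                        # no snap: keep the raw gaze point

# Example: gaze lands ~27 px from the second object's centre, so it snaps there.
boxes = [(100, 100, 180, 160), (400, 220, 480, 300)]
print(snap_gaze((430, 285), boxes))
```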
+
+ comment: Accepted to: IEEE/ACM International Conference on Human-Robot + Interaction (HRI 2025) +
+
+
+
+
+ + ☆ Touched by ChatGPT: Using an LLM to Drive Affective Tactile Interaction + + +
+ Touch is a fundamental aspect of emotion-rich communication, playing a vital +role in human interaction and offering significant potential in human-robot +interaction. Previous research has demonstrated that a sparse representation of +human touch can effectively convey social tactile signals. However, advances in +human-robot tactile interaction remain limited, as many humanoid robots possess +simplistic capabilities, such as only opening and closing their hands, +restricting nuanced tactile expressions. In this study, we explore how a robot +can use sparse representations of tactile vibrations to convey emotions to a +person. To achieve this, we developed a wearable sleeve integrated with a 5x5 +grid of vibration motors, enabling the robot to communicate diverse tactile +emotions and gestures. Using chain prompts within a Large Language Model (LLM), +we generated distinct 10-second vibration patterns corresponding to 10 emotions +(e.g., happiness, sadness, fear) and 6 touch gestures (e.g., pat, rub, tap). +Participants (N = 32) then rated each vibration stimulus based on perceived +valence and arousal. People are accurate at recognising intended emotions, a +result which aligns with earlier findings. These results highlight the LLM's +ability to generate emotional haptic data and effectively convey emotions +through tactile signals. By translating complex emotional and tactile +expressions into vibratory patterns, this research demonstrates how LLMs can +enhance physical interaction between humans and robots. + +
+
+
+
+
+ + ☆ Improving Incremental Nonlinear Dynamic Inversion Robustness Using + Robust Control in Aerial Robotics + + +
+ Improving robustness to uncertainty and rejection of external disturbances +represents a significant challenge in aerial robotics. Nonlinear controllers +based on Incremental Nonlinear Dynamic Inversion (INDI), known for their +ability in estimating disturbances through measured-filtered data, have been +notably used in such applications. Typically, these controllers comprise two +cascaded loops: an inner loop employing nonlinear dynamic inversion and an +outer loop generating the virtual control inputs via linear controllers. In +this paper, a novel methodology is introduced, that combines the advantages of +INDI with the robustness of linear structured $\mathcal{H}_\infty$ controllers. +A full cascaded architecture is proposed to control the dynamics of a +multirotor drone, covering both stabilization and guidance. In particular, +low-order $\mathcal{H}_\infty$ controllers are designed for the outer loop by +properly structuring the problem and solving it through non-smooth +optimization. A comparative analysis is conducted between an existing INDI/PD +approach and the proposed INDI/$\mathcal{H}_\infty$ strategy, showing a notable +enhancement in the rejection of external disturbances. It is carried out first +using MATLAB simulations involving a nonlinear model of a Parrot Bebop +quadcopter drone, and then experimentally using a customized quadcopter built +by the ENAC team. The results show an improvement of more than 50\% in the +rejection of disturbances such as gusts. + +
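The inner-loop increment that gives INDI its disturbance-estimating character can be written in a few lines; the sketch below uses placeholder numbers and omits the outer loop (the structured H-infinity controllers would supply the commanded virtual acceleration). It is an illustration of the generic INDI step, not the paper's controller.

```python
# Core INDI inner-loop step (illustrative, placeholder numbers): the control
# increment comes from the gap between the commanded virtual acceleration and
# the measured, filtered acceleration, mapped through the effectiveness matrix G.
import numpy as np

def indi_increment(u_prev, nu_cmd, accel_meas, G):
    """u_prev: last actuator command; nu_cmd: commanded virtual acceleration;
    accel_meas: filtered measured acceleration; G: control-effectiveness matrix."""
    delta_u = np.linalg.pinv(G) @ (nu_cmd - accel_meas)
    return u_prev + delta_u

# Toy example: 3 angular accelerations driven by 4 rotors (numbers are made up).
G = np.array([[ 1.0, -1.0, -1.0,  1.0],
              [ 1.0,  1.0, -1.0, -1.0],
              [ 1.0, -1.0,  1.0, -1.0]]) * 0.02
u_prev = np.full(4, 0.5)                      # current rotor commands
nu_cmd = np.array([0.3, -0.1, 0.0])           # desired angular acceleration
accel_meas = np.array([0.1, 0.0, 0.05])       # filtered gyro-derived acceleration
print(indi_increment(u_prev, nu_cmd, accel_meas, G))
```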
+
+
+
+
+ + ☆ Temperature Driven Multi-modal/Single-actuated Soft Finger + + +
+ Soft pneumatic fingers are of great research interest. However, their
+significant potential is limited as most of them can generate only one motion,
+mostly bending. The conventional design of soft fingers does not allow them to
+switch to another motion mode. In this paper, we developed a novel multi-modal
+and single-actuated soft finger whose motion mode is switched by changing the
+finger's temperature. Our soft finger is capable of switching between three
+distinct motion modes (bending, twisting, and extension) in approximately
+five seconds. We carried out a detailed experimental study of the soft finger
+and evaluated its repeatability and range of motion. It exhibited repeatability
+of around one millimeter and a fifty percent larger range of motion than a
+standard bending actuator. We developed an analytical model for a
+fiber-reinforced soft actuator with twisting motion, which relates the input
+pressure to the output twist radius of the twisting motion. This model was
+validated experimentally. Further, a soft robotic gripper with multiple grasp
+modes was developed using three actuators. This gripper can adapt to and grasp
+objects of a wide range of sizes, shapes, and stiffnesses. We showcased its
+grasping capabilities by successfully grasping a small berry, a large roll, and
+a delicate tofu cube. +
+
+
+
+
+ + ☆ Multi-face emotion detection for effective Human-Robot Interaction + + +
+ The integration of dialogue interfaces in mobile devices has become +ubiquitous, providing a wide array of services. As technology progresses, +humanoid robots designed with human-like features to interact effectively with +people are gaining prominence, and the use of advanced human-robot dialogue +interfaces is continually expanding. In this context, emotion recognition plays +a crucial role in enhancing human-robot interaction by enabling robots to +understand human intentions. This research proposes a facial emotion detection +interface integrated into a mobile humanoid robot, capable of displaying +real-time emotions from multiple individuals on a user interface. To this end, +various deep neural network models for facial expression recognition were +developed and evaluated under consistent computer-based conditions, yielding +promising results. Afterwards, a trade-off between accuracy and memory +footprint was carefully considered to effectively implement this application on +a mobile humanoid robot. + +
+
+ comment: 9 pages, 8 figures and 1 table. Accepted at the 17th International + Conference on Agents and Artificial Intelligence (ICAART 2025), Porto, + Portugal +
+
+
+
+
+ + ☆ Evaluating Robotic Approach Techniques for the Insertion of a Straight + Instrument into a Vitreoretinal Surgery Trocar + + +
+ Advances in vitreoretinal robotic surgery enable precise techniques for gene +therapies. This study evaluates three robotic approaches using the 7-DoF +robotic arm for docking a micro-precise tool to a trocar: fully co-manipulated, +hybrid co-manipulated/teleoperated, and hybrid with camera assistance. The +fully co-manipulated approach was the fastest but had a 42% success rate. +Hybrid methods showed higher success rates (91.6% and 100%) and completed tasks +within 2 minutes. NASA Task Load Index (TLX) assessments indicated lower +physical demand and effort for hybrid approaches. + +
+
+ comment: 2 Pages, 2 Figures, 1 Table +
+
+
+
+
+ + ☆ ROSAnnotator: A Web Application for ROSBag Data Analysis in Human-Robot + Interaction + + +
+ Human-robot interaction (HRI) is an interdisciplinary field that utilises +both quantitative and qualitative methods. While ROSBags, a file format within +the Robot Operating System (ROS), offer an efficient means of collecting +temporally synched multimodal data in empirical studies with real robots, there +is a lack of tools specifically designed to integrate qualitative coding and +analysis functions with ROSBags. To address this gap, we developed +ROSAnnotator, a web-based application that incorporates a multimodal Large +Language Model (LLM) to support both manual and automated annotation of ROSBag +data. ROSAnnotator currently facilitates video, audio, and transcription +annotations and provides an open interface for custom ROS messages and tools. +By using ROSAnnotator, researchers can streamline the qualitative analysis +process, create a more cohesive analysis pipeline, and quickly access +statistical summaries of annotations, thereby enhancing the overall efficiency +of HRI data analysis. https://github.com/CHRI-Lab/ROSAnnotator + +
+
+ comment: Accepted to HRI 2025 +
+
+
+
+
+ + ☆ Sthymuli: a Static Educational Robot. Leveraging the Thymio II Platform ICRA40 + + +
+ The use of robots in education poses a challenge for teachers and is often
+shaped by a fixed vision of what robots can do for students. This paper presents
+the development of Sthymuli, a static educational robot designed to explore new
+classroom interactions between robots, students and teachers. We propose the
+use of the Thymio II educational platform as a base, ensuring a robust
+benchmark for a fair comparison between commonly available wheeled robots and
+our exploratory approach with Sthymuli. This paper outlines the constraints and
+requirements for developing such a robot, the current state of development, and
+future work. +
+
+ comment: Two pages, three figures. ICRA40 extended abstract +
+
+
+
+
+ + ☆ Motion Tracks: A Unified Representation for Human-Robot Transfer in + Few-Shot Imitation Learning + + +
+ Teaching robots to autonomously complete everyday tasks remains a challenge. +Imitation Learning (IL) is a powerful approach that imbues robots with skills +via demonstrations, but is limited by the labor-intensive process of collecting +teleoperated robot data. Human videos offer a scalable alternative, but it +remains difficult to directly train IL policies from them due to the lack of +robot action labels. To address this, we propose to represent actions as +short-horizon 2D trajectories on an image. These actions, or motion tracks, +capture the predicted direction of motion for either human hands or robot +end-effectors. We instantiate an IL policy called Motion Track Policy (MT-pi) +which receives image observations and outputs motion tracks as actions. By +leveraging this unified, cross-embodiment action space, MT-pi completes tasks +with high success given just minutes of human video and limited additional +robot demonstrations. At test time, we predict motion tracks from two camera +views, recovering 6DoF trajectories via multi-view synthesis. MT-pi achieves an +average success rate of 86.5% across 4 real-world tasks, outperforming +state-of-the-art IL baselines which do not leverage human data or our action +space by 40%, and generalizes to scenarios seen only in human videos. Code and +videos are available on our website +https://portal-cornell.github.io/motion_track_policy/. + +
+
+
+
+
+ + ☆ Hand-Object Contact Detection using Grasp Quality Metrics + + +
+ We propose a novel hand-object contact detection system based on grasp +quality metrics extracted from object and hand poses, and evaluated its +performance using the DexYCB dataset. Our evaluation demonstrated the system's +high accuracy (approaching 90%). Future work will focus on a real-time +implementation using vision-based estimation, and integrating it to a +robot-to-human handover system. + +
+
+ comment: Submitted to the 2025 IEEE/ACM International Conference on + Human-Robot Interaction (HRI'25) +
+
+
+
+
+ + ☆ Testing Human-Hand Segmentation on In-Distribution and + Out-of-Distribution Data in Human-Robot Interactions Using a Deep Ensemble + Model + + +
+ Reliable detection and segmentation of human hands are critical for enhancing
+safety and facilitating advanced interactions in human-robot collaboration.
+Current research predominantly evaluates hand segmentation under
+in-distribution (ID) data, which reflects the training data of deep learning
+(DL) models. However, this approach fails to address out-of-distribution (OOD)
+scenarios that often arise in real-world human-robot interactions. In this
+study, we present a novel approach by evaluating the performance of pre-trained
+DL models under both ID data and more challenging OOD scenarios. To mimic
+realistic industrial scenarios, we designed a diverse dataset featuring simple
+and cluttered backgrounds with industrial tools, varying numbers of hands (0 to
+4), and hands with and without gloves. For OOD scenarios, we incorporated
+unique and rare conditions such as finger-crossing gestures and motion blur
+from fast-moving hands, addressing both epistemic and aleatoric uncertainties.
+To ensure multiple points of view (PoVs), we utilized both egocentric cameras,
+mounted on the operator's head, and static cameras to capture RGB images of
+human-robot interactions. This approach allowed us to account for multiple
+camera perspectives while also evaluating the performance of models trained on
+existing egocentric datasets as well as static-camera datasets. For
+segmentation, we used a deep ensemble model composed of UNet and RefineNet as
+base learners. Performance evaluation was conducted using segmentation metrics
+and uncertainty quantification via predictive entropy. Results revealed that
+models trained on industrial datasets outperformed those trained on
+non-industrial datasets, highlighting the importance of context-specific
+training. Although all models struggled with OOD scenarios, those trained on
+industrial datasets demonstrated significantly better generalization. +
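A hedged sketch of the evaluation idea is given below: average the per-pixel class probabilities of the ensemble members and quantify uncertainty with the predictive entropy of the averaged distribution. The member models here are stand-in callables, not the authors' UNet and RefineNet.

```python
# Sketch: fuse a deep ensemble's per-pixel probabilities and score uncertainty
# with predictive entropy (high entropy flags OOD-like pixels).
import numpy as np

def ensemble_predict(members, image):
    """members: callables returning per-pixel class probabilities of shape (H, W, C)."""
    probs = np.stack([m(image) for m in members], axis=0)   # (M, H, W, C)
    mean_probs = probs.mean(axis=0)                          # committee average
    entropy = -(mean_probs * np.log(mean_probs + 1e-12)).sum(axis=-1)  # (H, W)
    segmentation = mean_probs.argmax(axis=-1)                # e.g. hand / background
    return segmentation, entropy

# Toy usage with two fake members that disagree everywhere.
H, W = 4, 4
member_a = lambda img: np.dstack([np.full((H, W), 0.9), np.full((H, W), 0.1)])
member_b = lambda img: np.dstack([np.full((H, W), 0.2), np.full((H, W), 0.8)])
seg, ent = ensemble_predict([member_a, member_b], image=None)
print(seg[0, 0], round(float(ent[0, 0]), 3))   # disagreement -> entropy near log(2)
```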
+
+
+
+
+ + ☆ Autonomous Electrochemistry Platform with Real-Time Normality Testing of + Voltammetry Measurements Using ML + + +
+ Electrochemistry workflows utilize various instruments and computing systems +to execute workflows consisting of electrocatalyst synthesis, testing and +evaluation tasks. The heterogeneity of the software and hardware of these +ecosystems makes it challenging to orchestrate a complete workflow from +production to characterization by automating its tasks. We propose an +autonomous electrochemistry computing platform for a multi-site ecosystem that +provides the services for remote experiment steering, real-time measurement +transfer, and AI/ML-driven analytics. We describe the integration of a mobile +robot and synthesis workstation into the ecosystem by developing custom +hub-networks and software modules to support remote operations over the +ecosystem's wireless and wired networks. We describe a workflow task for +generating I-V voltammetry measurements using a potentiostat, and a machine +learning framework to ensure their normality by detecting abnormal conditions +such as disconnected electrodes. We study a number of machine learning methods +for the underlying detection problem, including smooth, non-smooth, structural +and statistical methods, and their fusers. We present experimental results to +illustrate the effectiveness of this platform, and also validate the proposed +ML method by deriving its rigorous generalization equations. + +
+
+ comment: 10 pages, 14 figures, accepted in the IEEE 20th International + Conference on e-Science (e-Science), 2024 +
+
+
+
+
+ + ♻ ☆ Few-Shot Task Learning through Inverse Generative Modeling + + +
+ Learning the intents of an agent, defined by its goals or motion style, is +often extremely challenging from just a few examples. We refer to this problem +as task concept learning and present our approach, Few-Shot Task Learning +through Inverse Generative Modeling (FTL-IGM), which learns new task concepts +by leveraging invertible neural generative models. The core idea is to pretrain +a generative model on a set of basic concepts and their demonstrations. Then, +given a few demonstrations of a new concept (such as a new goal or a new +action), our method learns the underlying concepts through backpropagation +without updating the model weights, thanks to the invertibility of the +generative model. We evaluate our method in five domains -- object +rearrangement, goal-oriented navigation, motion caption of human actions, +autonomous driving, and real-world table-top manipulation. Our experimental +results demonstrate that via the pretrained generative model, we successfully +learn novel concepts and generate agent plans or motion corresponding to these +concepts in (1) unseen environments and (2) in composition with training +concepts. + +
+
+ comment: Added acknowledgment +
+
+
+
+
+ + ♻ ☆ Accelerating genetic optimization of nonlinear model predictive control + by learning optimal search space size + + +
+ Genetic algorithm (GA) is typically used to solve nonlinear model predictive +control's optimization problem. However, the size of the search space in which +the GA searches for the optimal control inputs is crucial for its applicability +to fast-response systems. This paper proposes accelerating the genetic +optimization of NMPC by learning optimal search space size. The approach trains +a multivariate regression model to adaptively predict the best smallest size of +the search space in every control cycle. The proposed approach reduces the GA's +computational time, improves the chance of convergence to better control +inputs, and provides a stable and feasible solution. The proposed approach was +evaluated on three nonlinear systems and compared to four other evolutionary +algorithms implemented in a processor-in-the-loop fashion. The results show +that the proposed approach provides a 17-45\% reduction in computational time +and increases the convergence rate by 35-47\%. The source code is available on +GitHub. + +
+
+ comment: Accepted by the Journal of Control and Decision +
+
+
+
+
+ + ♻ ☆ Geometric Freeze-Tag Problem + + +
+ We study the Freeze-Tag Problem (FTP), introduced by Arkin et al. (SODA'02), +where the objective is to activate a group of n robots, starting from a single +initially active robot. Robots are positioned in $\mathbb{R}^d$, and once +activated, they move at a constant speed to wake up others. The goal is to +minimize the time required to activate the last robot, known as the makespan. +We establish new upper bounds for the makespan under the $l_1$ and $l_2$ norms +in $\mathbb{R}^2$ and $\mathbb{R}^3$. Specifically, we improve the previous +upper bound for $(\mathbb{R}^2, l_2)$ from $7.07r$ (Bonichon et al., DISC'24) +to $5.064r$. For $(\mathbb{R}^3, l_1)$, we derive a makespan bound of $13r$, +which translates to $22.52r$ for $(\mathbb{R}^3, l_2)$. Here, $r$ denotes the +maximum distance of any robot from the initially active robot under the given +norm. To our knowledge, these are the first makespan bounds for FTP in +$\mathbb{R}^3$. Additionally, we show that the maximum makespan for $n$ robots +is not necessarily achieved when robots are equally distributed along the +boundary in $(\mathbb{R}^2, l_2)$. We further investigate FTP in +$(\mathbb{R}^3, l_2)$ for specific configurations where robots lie on a +boundary, providing insights into practical scenarios. + +
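For readers unfamiliar with the setting, the toy simulation below wakes robots with a greedy nearest-asleep strategy and reports the resulting (generally suboptimal) makespan for a random instance; it is only an illustration of the problem, not the constructions behind the paper's bounds.

```python
# Toy Freeze-Tag simulation: active robots repeatedly move to the nearest
# still-asleep robot at unit speed; the time the last robot wakes is a
# (suboptimal) makespan estimate for the instance.
import heapq
import numpy as np

def greedy_makespan(positions: np.ndarray) -> float:
    """positions[0] is the initially active robot; Euclidean (l2) distances."""
    asleep = set(range(1, len(positions)))
    events = [(0.0, 0)]          # (time a robot becomes free, robot index)
    makespan = 0.0
    while asleep and events:
        t, robot = heapq.heappop(events)
        targets = list(asleep)
        dists = np.linalg.norm(positions[targets] - positions[robot], axis=1)
        j = targets[int(np.argmin(dists))]           # greedy: nearest asleep robot
        wake_time = t + float(dists.min())
        asleep.remove(j)
        makespan = max(makespan, wake_time)
        heapq.heappush(events, (wake_time, robot))   # both robots keep waking others
        heapq.heappush(events, (wake_time, j))
    return makespan

rng = np.random.default_rng(0)
pts = rng.uniform(-1.0, 1.0, size=(50, 2))
r = float(np.linalg.norm(pts[1:] - pts[0], axis=1).max())
print(f"greedy makespan = {greedy_makespan(pts):.3f}, r = {r:.3f}")
```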
+
+
+
+
+ + ♻ ☆ QuadWBG: Generalizable Quadrupedal Whole-Body Grasping + + +
+ Legged robots with advanced manipulation capabilities have the potential to +significantly improve household duties and urban maintenance. Despite +considerable progress in developing robust locomotion and precise manipulation +methods, seamlessly integrating these into cohesive whole-body control for +real-world applications remains challenging. In this paper, we present a +modular framework for robust and generalizable whole-body loco-manipulation +controller based on a single arm-mounted camera. By using reinforcement +learning (RL), we enable a robust low-level policy for command execution over 5 +dimensions (5D) and a grasp-aware high-level policy guided by a novel metric, +Generalized Oriented Reachability Map (GORM). The proposed system achieves +state-of-the-art one-time grasping accuracy of 89% in the real world, including +challenging tasks such as grasping transparent objects. Through extensive +simulations and real-world experiments, we demonstrate that our system can +effectively manage a large workspace, from floor level to above body height, +and perform diverse whole-body loco-manipulation tasks. + +
+
+
+
+
+ + ♻ ☆ SyncDiff: Synchronized Motion Diffusion for Multi-Body Human-Object + Interaction Synthesis + + +
+ Synthesizing realistic human-object interaction motions is a critical problem +in VR/AR and human animation. Unlike the commonly studied scenarios involving a +single human or hand interacting with one object, we address a more generic +multi-body setting with arbitrary numbers of humans, hands, and objects. This +complexity introduces significant challenges in synchronizing motions due to +the high correlations and mutual influences among bodies. To address these +challenges, we introduce SyncDiff, a novel method for multi-body interaction +synthesis using a synchronized motion diffusion strategy. SyncDiff employs a +single diffusion model to capture the joint distribution of multi-body motions. +To enhance motion fidelity, we propose a frequency-domain motion decomposition +scheme. Additionally, we introduce a new set of alignment scores to emphasize +the synchronization of different body motions. SyncDiff jointly optimizes both +data sample likelihood and alignment likelihood through an explicit +synchronization strategy. Extensive experiments across four datasets with +various multi-body configurations demonstrate the superiority of SyncDiff over +existing state-of-the-art motion synthesis methods. + +
+
+
+
+
+ + ♻ ☆ An Adaptive Sliding Window Estimator for Positioning of Unmanned Aerial + Vehicle Using a Single Anchor + + +
+ Localization using a single range anchor combined with onboard +optical-inertial odometry offers a lightweight solution that provides +multidimensional measurements for the positioning of unmanned aerial vehicles. +Unfortunately, the performance of such lightweight sensors varies with the +dynamic environment, and the fidelity of the dynamic model is also severely +affected by environmental aerial flow. To address this challenge, we propose an +adaptive sliding window estimator equipped with an estimation reliability +evaluator, where the states, noise covariance matrices and aerial drag are +estimated simultaneously. The aerial drag effects are first evaluated based on +posterior states and covariance. Then, an augmented Kalman filter is designed +to pre-process multidimensional measurements and inherit historical +information. Subsequently, an inverse-Wishart smoother is employed to estimate +posterior states and covariance matrices. To further suppress potential +divergence, a reliability evaluator is devised to infer estimation errors. We +further determine the fidelity of each sensor based on the error propagation. +Extensive experiments are conducted in both standard and harsh environments, +demonstrating the adaptability and robustness of the proposed method. The root +mean square error reaches 0.15 m, outperforming the state-of-the-art approach. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ Walk along: An Experiment on Controlling the Mobile Robot 'Spot' with + Voice and Gestures + + +
+ Robots are becoming more capable and can autonomously perform tasks such as +navigating between locations. However, human oversight remains crucial. This +study compared two touchless methods for directing mobile robots: voice control +and gesture control, to investigate the efficiency of the methods and the +preference of users. We tested these methods in two conditions: one in which +participants remained stationary and one in which they walked freely alongside +the robot. We hypothesized that walking alongside the robot would result in +higher intuitiveness ratings and improved task performance, based on the idea +that walking promotes spatial alignment and reduces the effort required for +mental rotation. In a 2x2 within-subject design, 218 participants guided the +quadruped robot Spot along a circuitous route with multiple 90-degree turns +using rotate left, rotate right, and walk forward commands. After each trial, +participants rated the intuitiveness of the command mapping, while +post-experiment interviews were used to gather the participants' preferences. +Results showed that voice control combined with walking with Spot was the most +favored and intuitive, whereas gesture control while standing caused confusion +for left/right commands. Nevertheless, 29% of participants preferred gesture +control, citing increased task engagement and visual congruence as reasons. An +odometry-based analysis revealed that participants often followed behind Spot, +particularly in the gesture control condition, when they were allowed to walk. +In conclusion, voice control with walking produced the best outcomes. Improving +physical ergonomics and adjusting gesture types could make gesture control more +effective. + +
+
+
+
+
+ + ♻ ☆ Adaptive Non-linear Centroidal MPC with Stability Guarantees for Robust + Locomotion of Legged Robots + + +
+ Nonlinear model predictive locomotion controllers based on the reduced +centroidal dynamics are nowadays ubiquitous in legged robots. These schemes, +even if they assume an inherent simplification of the robot's dynamics, were +shown to endow robots with a step-adjustment capability in reaction to small +pushes, and, moreover, in the case of uncertain parameters - as unknown +payloads - they were shown to be able to provide some practical, albeit +limited, robustness. In this work, we provide rigorous certificates of their +closed loop stability via a reformulation of the centroidal MPC controller. +This is achieved thanks to a systematic procedure inspired by the machinery of +adaptive control, together with ideas coming from Control Lyapunov functions. +Our reformulation, in addition, provides robustness for a class of unmeasured +constant disturbances. To demonstrate the generality of our approach, we +validated our formulation on a new generation of humanoid robots - the 56.7 kg +ergoCub, as well as on a commercially available 21 kg quadruped robot, Aliengo. + +
+
+
+
+
+ + ♻ ☆ From Underground Mines to Offices: A Versatile and Robust Framework for + Range-Inertial SLAM + + +
+ Simultaneous Localization and Mapping (SLAM) is an essential component of +autonomous robotic applications and self-driving vehicles, enabling them to +understand and operate in their environment. Many SLAM systems have been +proposed in the last decade, but they are often complex to adapt to different +settings or sensor setups. In this work, we present LiDAR Graph-SLAM (LG-SLAM), +a versatile range-inertial SLAM framework that can be adapted to different +types of sensors and environments, from underground mines to offices with +minimal parameter tuning. Our system integrates range, inertial and GNSS +measurements into a graph-based optimization framework. We also use a refined +submap management approach and a robust loop closure method that effectively +accounts for uncertainty in the identification and validation of putative loop +closures, ensuring global consistency and robustness. Enabled by a parallelized +architecture and GPU integration, our system achieves pose estimation at LiDAR +frame rate, along with online loop closing and graph optimization. We validate +our system in diverse environments using public datasets and real-world data, +consistently achieving an average error below 20 cm and outperforming other +state-of-the-art algorithms. + +
+
+ comment: 8 pages, 8 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ LLaMAR: Long-Horizon Planning for Multi-Agent Robots in Partially + Observable Environments + + +
+ The ability of Language Models (LMs) to understand natural language makes +them a powerful tool for parsing human instructions into task plans for +autonomous robots. Unlike traditional planning methods that rely on +domain-specific knowledge and handcrafted rules, LMs generalize from diverse +data and adapt to various tasks with minimal tuning, acting as a compressed +knowledge base. However, LMs in their standard form face challenges with +long-horizon tasks, particularly in partially observable multi-agent settings. +We propose an LM-based Long-Horizon Planner for Multi-Agent Robotics (LLaMAR), +a cognitive architecture for planning that achieves state-of-the-art results in +long-horizon tasks within partially observable environments. LLaMAR employs a +plan-act-correct-verify framework, allowing self-correction from action +execution feedback without relying on oracles or simulators. Additionally, we +present MAP-THOR, a comprehensive test suite encompassing household tasks of +varying complexity within the AI2-THOR environment. Experiments show that +LLaMAR achieves a 30% higher success rate than other state-of-the-art LM-based +multi-agent planners in MAP-THOR and Search \& Rescue tasks. Code can be found +at https://github.com/nsidn98/LLaMAR + +
+
+ comment: 27 pages, 4 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Map Imagination Like Blind Humans: Group Diffusion Model for Robotic Map + Generation + + +
+ Can robots imagine or generate maps like humans do, especially when only
+limited information can be perceived like blind people? To address this
+challenging task, we propose a novel group diffusion model (GDM) based
+architecture for robots to generate point cloud maps with very limited input
+information. Inspired by blind humans' natural capability of imagining or
+generating mental maps, the proposed method can generate maps without visual
+perception data or depth data. With additional limited super-sparse spatial
+positioning data, like the extra contact-based positioning information the
+blind individuals can obtain, the map generation quality can be improved even
+more. Experiments on public datasets are conducted, and the results indicate
+that our method can generate reasonable maps solely based on path data, and
+produce even more refined maps upon incorporating exiguous LiDAR data. Compared
+to conventional mapping approaches, our novel method significantly mitigates
+sensor dependency, enabling the robots to imagine and generate elementary maps
+without heavy onboard sensory devices. +
+
+
+
+
+ + ♻ ☆ Robot Error Awareness Through Human Reactions: Implementation, + Evaluation, and Recommendations + + +
+ Effective error detection is crucial to prevent task disruption and maintain
+user trust. Traditional methods often rely on task-specific models or user
+reporting, which can be inflexible or slow. Recent research suggests social
+signals, naturally exhibited by users in response to robot errors, can enable
+more flexible, timely error detection. However, most studies rely on post hoc
+analysis, leaving their real-time effectiveness uncertain and lacking
+user-centric evaluation. In this work, we developed a proactive error detection
+system that combines user behavioral signals (facial action units and speech),
+user feedback, and error context for automatic error detection. In a study (N =
+28), we compared our proactive system to a status quo reactive approach.
+Results show our system 1) reliably and flexibly detects errors, 2) detects
+errors faster than the reactive approach, and 3) is perceived more favorably by
+users than the reactive one. We discuss recommendations for enabling robot
+error awareness in future HRI systems. +
+
+
+
+
+ + ♻ ☆ Efficient Estimation of Relaxed Model Parameters for Robust UAV + Trajectory Optimization + + +
+ Online trajectory optimization and optimal control methods are crucial for
+enabling sustainable unmanned aerial vehicle (UAV) services, such as
+agriculture, environmental monitoring, and transportation, where available
+actuation and energy are limited. However, optimal controllers are highly
+sensitive to model mismatch, which can occur due to loaded equipment, packages
+to be delivered, or pre-existing variability in fundamental structural and
+thrust-related parameters. To circumvent this problem, optimal controllers can
+be paired with parameter estimators to improve their trajectory planning
+performance and perform adaptive control. However, UAV platforms are limited in
+terms of onboard processing power, oftentimes making nonlinear parameter
+estimation too computationally expensive to consider. To address these issues,
+we propose a relaxed, affine-in-parameters multirotor model along with an
+efficient optimal parameter estimator. We convexify the nominal Moving Horizon
+Parameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via
+an affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast
+quadratic programs (QPs) that facilitate adaptive Model Predictive Control (MPC)
+in real time. We compare this approach to the equivalent nonlinear estimator in
+Monte Carlo simulations, demonstrating a decrease in average solve time and
+trajectory optimality cost by 98.2% and 23.9-56.2%, respectively. +
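The key point is that an affine-in-parameters model turns horizon estimation into a convex problem. The sketch below illustrates this with a one-dimensional "thrust gain plus drag" stand-in model estimated over a moving window by linear least squares (an unconstrained QP); it is not the authors' LQ-MHPE formulation.

```python
# Minimal illustration: with an affine-in-parameters model, stacking regressors
# over a moving horizon gives a linear least-squares problem instead of an NLP.
import numpy as np
from collections import deque

class SlidingWindowEstimator:
    def __init__(self, window: int):
        self.rows = deque(maxlen=window)   # (regressor, measurement) pairs

    def add(self, regressor, measurement):
        self.rows.append((np.asarray(regressor, dtype=float), float(measurement)))

    def estimate(self):
        A = np.stack([r for r, _ in self.rows])
        b = np.array([m for _, m in self.rows])
        theta, *_ = np.linalg.lstsq(A, b, rcond=None)
        return theta

# Simulated data: accel = k_thrust * u - c_drag * v, plus noise (made-up model).
rng = np.random.default_rng(3)
true_theta = np.array([4.0, 0.6])                      # [k_thrust, c_drag]
est = SlidingWindowEstimator(window=50)
for _ in range(200):
    u, v = rng.uniform(0, 1), rng.uniform(-2, 2)
    accel = true_theta @ [u, -v] + rng.normal(scale=0.05)
    est.add([u, -v], accel)                            # regressor is affine in theta
print("estimated [k_thrust, c_drag]:", np.round(est.estimate(), 3))
```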
+
+ comment: 8 pages, 5 figures, to be published in IEEE Sustech 2025 +
+
+
+
+
+ + ♻ ☆ A Mixed-Integer Conic Program for the Moving-Target Traveling Salesman + Problem based on a Graph of Convex Sets + + +
+ This paper introduces a new formulation that finds the optimum for the +Moving-Target Traveling Salesman Problem (MT-TSP), which seeks to find a +shortest path for an agent, that starts at a depot, visits a set of moving +targets exactly once within their assigned time-windows, and returns to the +depot. The formulation relies on the key idea that when the targets move along +lines, their trajectories become convex sets within the space-time coordinate +system. The problem then reduces to finding the shortest path within a graph of +convex sets, subject to some speed constraints. We compare our formulation with +the current state-of-the-art Mixed Integer Conic Program (MICP) solver for the +MT-TSP. The experimental results show that our formulation outperforms the MICP +for instances with up to 20 targets, with up to two orders of magnitude +reduction in runtime, and up to a 60\% tighter optimality gap. We also show +that the solution cost from the convex relaxation of our formulation provides +significantly tighter lower bounds for the MT-TSP than the ones from the MICP. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Exploiting Chordal Sparsity for Fast Global Optimality with Application + to Localization + + +
+ In recent years, many estimation problems in robotics have been shown to be +solvable to global optimality using their semidefinite relaxations. However, +the runtime complexity of off-the-shelf semidefinite programming (SDP) solvers +is up to cubic in problem size, which inhibits real-time solutions of problems +involving large state dimensions. We show that for a large class of problems, +namely those with chordal sparsity, we can reduce the complexity of these +solvers to linear in problem size. In particular, we show how to replace the +large positive-semidefinite variable with a number of smaller interconnected +ones using the well-known chordal decomposition. This formulation also allows +for the straightforward application of the alternating direction method of +multipliers (ADMM), which can exploit parallelism for increased scalability. We +show for two example problems in simulation that the chordal solvers provide a +significant speed-up over standard SDP solvers, and that global optimality is +crucial in the absence of good initializations. + +
+
+ comment: 21 pages, 6 figures. Version history: v1: initial arXiv, v2: WAFR + submission, v3: correction, v4: WAFR conference-ready, v5: WAFR SPAR journal + version +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 141 + +
+
+
+ + ☆ Dataset Distillation via Committee Voting + + +
+ Dataset distillation aims to synthesize a smaller, representative dataset
+that preserves the essential properties of the original data, enabling
+efficient model training with reduced computational resources. Prior work has
+primarily focused on improving the alignment or matching process between
+original and synthetic data, or on enhancing the efficiency of distilling large
+datasets. In this work, we introduce Committee Voting for Dataset Distillation
+(CV-DD), a novel and orthogonal approach that
+leverages the collective wisdom of multiple models or experts to create
+high-quality distilled datasets. We start by showing how to establish a strong
+baseline that already achieves state-of-the-art accuracy through leveraging
+recent advancements and thoughtful adjustments in model design and optimization
+processes. By integrating distributions and predictions from a committee of
+models while generating high-quality soft labels, our method captures a wider
+spectrum of data features, reduces model-specific biases and the adverse
+effects of distribution shifts, leading to significant improvements in
+generalization. This voting-based strategy not only promotes diversity and
+robustness within the distilled dataset but also significantly reduces
+overfitting, resulting in improved performance on post-eval tasks. Extensive
+experiments across various datasets and IPCs (images per class) demonstrate
+that Committee Voting leads to more reliable and adaptable distilled data
+compared to single/multi-model distillation methods, demonstrating its
+potential for efficient and accurate dataset distillation. Code is available
+at: https://github.com/Jiacheng8/CV-DD. +
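One ingredient mentioned above, committee-generated soft labels, can be sketched in a few lines: the soft label for a distilled image is the averaged (optionally temperature-scaled) prediction of the committee members. The member models, temperature, and class count below are placeholders, not the CV-DD pipeline.

```python
# Sketch of committee soft-labeling (illustrative; the full CV-DD pipeline is
# not reproduced): average the committee's temperature-scaled predictions.
import numpy as np

def softmax(z, T=1.0):
    z = np.asarray(z, dtype=float) / T
    z -= z.max()
    e = np.exp(z)
    return e / e.sum()

def committee_soft_label(committee, image, temperature=4.0):
    """committee: callables mapping an image to pre-softmax logits of shape (C,)."""
    probs = [softmax(model(image), T=temperature) for model in committee]
    return np.mean(probs, axis=0)            # committee-voted soft label

# Toy usage: three "experts" with slightly different opinions about 3 classes.
experts = [lambda x: np.array([2.0, 0.5, 0.1]),
           lambda x: np.array([1.5, 1.0, 0.2]),
           lambda x: np.array([2.2, 0.2, 0.4])]
print(np.round(committee_soft_label(experts, image=None), 3))
```

Averaging across members is what reduces any single model's idiosyncratic bias in the resulting labels.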
+
+ comment: Code at: https://github.com/Jiacheng8/CV-DD +
+
+
+
+
+ + ☆ UnCommon Objects in 3D + + +
+ We introduce Uncommon Objects in 3D (uCO3D), a new object-centric dataset for +3D deep learning and 3D generative AI. uCO3D is the largest publicly-available +collection of high-resolution videos of objects with 3D annotations that +ensures full-360$^{\circ}$ coverage. uCO3D is significantly more diverse than +MVImgNet and CO3Dv2, covering more than 1,000 object categories. It is also of +higher quality, due to extensive quality checks of both the collected videos +and the 3D annotations. Similar to analogous datasets, uCO3D contains +annotations for 3D camera poses, depth maps and sparse point clouds. In +addition, each object is equipped with a caption and a 3D Gaussian Splat +reconstruction. We train several large 3D models on MVImgNet, CO3Dv2, and uCO3D +and obtain superior results using the latter, showing that uCO3D is better for +learning applications. + +
+
+
+
+
+ + ☆ Training-Free Motion-Guided Video Generation with Enhanced Temporal + Consistency Using Motion Consistency Loss + + +
+ In this paper, we address the challenge of generating temporally consistent +videos with motion guidance. While many existing methods depend on additional +control modules or inference-time fine-tuning, recent studies suggest that +effective motion guidance is achievable without altering the model architecture +or requiring extra training. Such approaches offer promising compatibility with +various video generation foundation models. However, existing training-free +methods often struggle to maintain consistent temporal coherence across frames +or to follow guided motion accurately. In this work, we propose a simple yet +effective solution that combines an initial-noise-based approach with a novel +motion consistency loss, the latter being our key innovation. Specifically, we +capture the inter-frame feature correlation patterns of intermediate features +from a video diffusion model to represent the motion pattern of the reference +video. We then design a motion consistency loss to maintain similar feature +correlation patterns in the generated video, using the gradient of this loss in +the latent space to guide the generation process for precise motion control. +This approach improves temporal consistency across various motion control tasks +while preserving the benefits of a training-free setup. Extensive experiments +show that our method sets a new standard for efficient, temporally coherent +video generation. + +
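A hedged sketch of the motion-consistency idea follows: compare inter-frame feature correlation patterns of the reference and generated videos and penalize the difference, then use the gradient of that loss in latent space. The feature shapes and the use of cosine similarity are assumptions for illustration, not the paper's exact definition.

```python
# Sketch: penalize differences between the inter-frame feature correlation
# patterns of the reference video and the generated video.
import torch
import torch.nn.functional as F

def interframe_correlation(feats: torch.Tensor) -> torch.Tensor:
    """feats: (T, C, H, W) intermediate diffusion features for T frames.
    Returns a (T, T, H*W) map of per-location cosine similarities between frames."""
    T, C, H, W = feats.shape
    flat = F.normalize(feats.reshape(T, C, H * W), dim=1)     # unit norm per location
    return torch.einsum("tcp,scp->tsp", flat, flat)           # frame-to-frame corr

def motion_consistency_loss(ref_feats, gen_feats):
    return F.mse_loss(interframe_correlation(gen_feats),
                      interframe_correlation(ref_feats).detach())

# The gradient of this loss with respect to the latent can then steer sampling,
# e.g. latent = latent - step_size * torch.autograd.grad(loss, latent)[0]
```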
+
+ comment: Project page: + https://zhangxinyu-xyz.github.io/SimulateMotion.github.io/ +
+
+
+
+
+ + ☆ MatchAnything: Universal Cross-Modality Image Matching with Large-Scale + Pre-Training + + +
+ Image matching, which aims to identify corresponding pixel locations between +images, is crucial in a wide range of scientific disciplines, aiding in image +registration, fusion, and analysis. In recent years, deep learning-based image +matching algorithms have dramatically outperformed humans in rapidly and +accurately finding large amounts of correspondences. However, when dealing with +images captured under different imaging modalities that result in significant +appearance changes, the performance of these algorithms often deteriorates due +to the scarcity of annotated cross-modal training data. This limitation hinders +applications in various fields that rely on multiple image modalities to obtain +complementary information. To address this challenge, we propose a large-scale +pre-training framework that utilizes synthetic cross-modal training signals, +incorporating diverse data from various sources, to train models to recognize +and match fundamental structures across images. This capability is transferable +to real-world, unseen cross-modality image matching tasks. Our key finding is +that the matching model trained with our framework achieves remarkable +generalizability across more than eight unseen cross-modality registration +tasks using the same network weight, substantially outperforming existing +methods, whether designed for generalization or tailored for specific tasks. +This advancement significantly enhances the applicability of image matching +technologies across various scientific disciplines and paves the way for new +applications in multi-modality human and artificial intelligence analysis and +beyond. + +
+
+ comment: Project page: https://zju3dv.github.io/MatchAnything/ +
+
+
+
+
+ + ☆ SST-EM: Advanced Metrics for Evaluating Semantic, Spatial and Temporal + Aspects in Video Editing + + +
+ Video editing models have advanced significantly, but evaluating their +performance remains challenging. Traditional metrics, such as CLIP text and +image scores, often fall short: text scores are limited by inadequate training +data and hierarchical dependencies, while image scores fail to assess temporal +consistency. We present SST-EM (Semantic, Spatial, and Temporal Evaluation +Metric), a novel evaluation framework that leverages modern Vision-Language +Models (VLMs), Object Detection, and Temporal Consistency checks. SST-EM +comprises four components: (1) semantic extraction from frames using a VLM, (2) +primary object tracking with Object Detection, (3) focused object refinement +via an LLM agent, and (4) temporal consistency assessment using a Vision +Transformer (ViT). These components are integrated into a unified metric with +weights derived from human evaluations and regression analysis. The name SST-EM +reflects its focus on Semantic, Spatial, and Temporal aspects of video +evaluation. SST-EM provides a comprehensive evaluation of semantic fidelity and +temporal smoothness in video editing. The source code is available in the +\textbf{\href{https://github.com/custommetrics-sst/SST_CustomEvaluationMetrics.git}{GitHub +Repository}}. + +
+
+ comment: WACV workshop +
+
+
+
+
+ + ☆ Imagine while Reasoning in Space: Multimodal Visualization-of-Thought + + +
+ Chain-of-Thought (CoT) prompting has proven highly effective for enhancing +complex reasoning in Large Language Models (LLMs) and Multimodal Large Language +Models (MLLMs). Yet, it struggles in complex spatial reasoning tasks. +Nonetheless, human cognition extends beyond language alone, enabling the +remarkable capability to think in both words and images. Inspired by this +mechanism, we propose a new reasoning paradigm, Multimodal +Visualization-of-Thought (MVoT). It enables visual thinking in MLLMs by +generating image visualizations of their reasoning traces. To ensure +high-quality visualization, we introduce token discrepancy loss into +autoregressive MLLMs. This innovation significantly improves both visual +coherence and fidelity. We validate this approach through several dynamic +spatial reasoning tasks. Experimental results reveal that MVoT demonstrates +competitive performance across tasks. Moreover, it exhibits robust and reliable +improvements in the most challenging scenarios where CoT fails. Ultimately, +MVoT establishes new possibilities for complex reasoning tasks where visual +thinking can effectively complement verbal reasoning. + +
+
+ comment: 11 pages, 6 figures, 4 tables (27 pages, 10 figures, 16 tables + including references and appendices) +
+
+
+
+
+ + ☆ Confident Pseudo-labeled Diffusion Augmentation for Canine Cardiomegaly + Detection + + +
+ Canine cardiomegaly, marked by an enlarged heart, poses serious health risks +if undetected, requiring accurate diagnostic methods. Current detection models +often rely on small, poorly annotated datasets and struggle to generalize +across diverse imaging conditions, limiting their real-world applicability. To +address these issues, we propose a Confident Pseudo-labeled Diffusion +Augmentation (CDA) model for identifying canine cardiomegaly. Our approach +addresses the challenge of limited high-quality training data by employing +diffusion models to generate synthetic X-ray images and annotate Vertebral +Heart Score key points, thereby expanding the dataset. We also employ a +pseudo-labeling strategy with Monte Carlo Dropout to select high-confidence +labels, refine the synthetic dataset, and improve accuracy. Iteratively +incorporating these labels enhances the model's performance, overcoming the +limitations of existing approaches. Experimental results show that the CDA +model outperforms traditional methods, achieving state-of-the-art accuracy in +canine cardiomegaly detection. The code implementation is available at +https://github.com/Shira7z/CDA. + +
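A minimal sketch of the confidence-filtering step, assuming a PyTorch keypoint regressor with dropout layers: predictions are sampled with dropout kept active, and only low-variance (high-confidence) samples are retained as pseudo-labels. The interface, number of passes, and threshold are illustrative assumptions, not the CDA implementation.

```python
import torch

@torch.no_grad()
def mc_dropout_pseudo_labels(model, images, n_passes=20, var_threshold=1e-3):
    """Keep keypoint pseudo-labels only when MC-Dropout predictions agree.

    model:  a keypoint regressor returning (B, K, 2); dropout stays active
            because the module is left in train() mode.
    images: (B, C, H, W) batch of synthetic X-ray images.
    Returns (mean_keypoints, keep_mask) where keep_mask flags low-variance,
    i.e. high-confidence, samples.
    """
    model.train()                                        # keep dropout active
    preds = torch.stack([model(images) for _ in range(n_passes)])  # (N, B, K, 2)
    mean, var = preds.mean(0), preds.var(0)
    keep = var.mean(dim=(1, 2)) < var_threshold          # per-sample confidence test
    return mean, keep
```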
+
+ comment: WACV workshop +
+
+
+
+
+ + ☆ IP-FaceDiff: Identity-Preserving Facial Video Editing with Diffusion + + +
+ Facial video editing has become increasingly important for content creators, +enabling the manipulation of facial expressions and attributes. However, +existing models encounter challenges such as poor editing quality, high +computational costs and difficulties in preserving facial identity across +diverse edits. Additionally, these models are often constrained to editing +predefined facial attributes, limiting their flexibility to diverse editing +prompts. To address these challenges, we propose a novel facial video editing +framework that leverages the rich latent space of pre-trained text-to-image +(T2I) diffusion models and fine-tunes them specifically for facial video editing +tasks. Our approach introduces a targeted fine-tuning scheme that enables +high-quality, localized, text-driven edits while ensuring identity preservation +across video frames. Additionally, by using pre-trained T2I models during +inference, our approach significantly reduces editing time by 80%, while +maintaining temporal consistency throughout the video sequence. We evaluate the +effectiveness of our approach through extensive testing across a wide range of +challenging scenarios, including varying head poses, complex action sequences, +and diverse facial expressions. Our method consistently outperforms existing +techniques, demonstrating superior performance across a broad set of metrics +and benchmarks. +
+
+ comment: WACV-25 Workshop +
+
+
+
+
+ + ☆ RadAlign: Advancing Radiology Report Generation with Vision-Language + Concept Alignment + + +
+ Automated chest radiographs interpretation requires both accurate disease +classification and detailed radiology report generation, presenting a +significant challenge in the clinical workflow. Current approaches either focus +on classification accuracy at the expense of interpretability or generate +detailed but potentially unreliable reports through image captioning +techniques. In this study, we present RadAlign, a novel framework that combines +the predictive accuracy of vision-language models (VLMs) with the reasoning +capabilities of large language models (LLMs). Inspired by the radiologist's +workflow, RadAlign first employs a specialized VLM to align visual features +with key medical concepts, achieving superior disease classification with an +average AUC of 0.885 across multiple diseases. These recognized medical +conditions, represented as text-based concepts in the aligned visual-language +space, are then used to prompt LLM-based report generation. Enhanced by a +retrieval-augmented generation mechanism that grounds outputs in similar +historical cases, RadAlign delivers superior report quality with a GREEN score +of 0.678, outperforming state-of-the-art methods' 0.634. Our framework +maintains strong clinical interpretability while reducing hallucinations, +advancing automated medical imaging and report analysis through integrated +predictive and generative AI. Code is available at +https://github.com/difeigu/RadAlign. + +
+
+
+
+
+ + ☆ Three-view Focal Length Recovery From Homographies + + +
+ In this paper, we propose a novel approach for recovering focal lengths from +three-view homographies. By examining the consistency of normal vectors between +two homographies, we derive new explicit constraints between the focal lengths +and homographies using an elimination technique. We demonstrate that three-view +homographies provide two additional constraints, enabling the recovery of one +or two focal lengths. We discuss four possible cases, including three cameras +having an unknown equal focal length, three cameras having two different +unknown focal lengths, three cameras where one focal length is known, and the +other two cameras have equal or different unknown focal lengths. All the +problems can be converted into solving polynomials in one or two unknowns, +which can be efficiently solved using Sturm sequence or hidden variable +technique. Evaluation using both synthetic and real data shows that the +proposed solvers are both faster and more accurate than methods relying on +existing two-view solvers. The code and data are available on +https://github.com/kocurvik/hf + +
+
+ comment: Code available at https://github.com/kocurvik/hf Dataset available + at: https://doi.org/10.5281/zenodo.14638904 +
+
+
+
+
+ + ☆ Aligning First, Then Fusing: A Novel Weakly Supervised Multimodal + Violence Detection Method + + +
+ Weakly supervised violence detection refers to the technique of training +models to identify violent segments in videos using only video-level labels. +Among these approaches, multimodal violence detection, which integrates +modalities such as audio and optical flow, holds great potential. Existing +methods in this domain primarily focus on designing multimodal fusion models to +address modality discrepancies. In contrast, we take a different approach, +leveraging the inherent discrepancies across modalities in violence event +representation to propose a novel multimodal semantic feature alignment method. +This method sparsely maps the semantic features of local, transient, and less +informative modalities (such as audio and optical flow) into the more +informative RGB semantic feature space. Through an iterative process, the +method identifies a suitable non-zero feature matching subspace and aligns the +modality-specific event representations based on this subspace, enabling the +full exploitation of information from all modalities during the subsequent +modality fusion stage. Building on this, we design a new weakly supervised +violence detection framework that consists of unimodal multiple-instance +learning for extracting unimodal semantic features, multimodal alignment, +multimodal fusion, and final detection. Experimental results on benchmark +datasets demonstrate the effectiveness of our method, achieving an average +precision (AP) of 86.07% on the XD-Violence dataset. Our code is available at +https://github.com/xjpp2016/MAVD. +
+
+
+
+
+ + ☆ 3DGS-to-PC: Convert a 3D Gaussian Splatting Scene into a Dense Point + Cloud or Mesh + + +
+ 3D Gaussian Splatting (3DGS) excels at producing highly detailed 3D +reconstructions, but these scenes often require specialised renderers for +effective visualisation. In contrast, point clouds are a widely used 3D +representation and are compatible with most popular 3D processing software, yet +converting 3DGS scenes into point clouds is a complex challenge. In this work +we introduce 3DGS-to-PC, a flexible and highly customisable framework that is +capable of transforming 3DGS scenes into dense, high-accuracy point clouds. We +sample points probabilistically from each Gaussian as a 3D density function. We +additionally threshold new points using the Mahalanobis distance to the +Gaussian centre, preventing extreme outliers. The result is a point cloud that +closely represents the shape encoded into the 3D Gaussian scene. Individual +Gaussians use spherical harmonics to adapt colours depending on view, and each +point may contribute only subtle colour hints to the resulting rendered scene. +To avoid spurious or incorrect colours that do not fit with the final point +cloud, we recalculate Gaussian colours via a customised image rendering +approach, assigning each Gaussian the colour of the pixel to which it +contributes most across all views. 3DGS-to-PC also supports mesh generation +through Poisson Surface Reconstruction, applied to points sampled from +predicted surface Gaussians. This allows coloured meshes to be generated from +3DGS scenes without the need for re-training. This package is highly +customisable and capable of simple integration into existing 3DGS pipelines. +3DGS-to-PC provides a powerful tool for converting 3DGS data into point cloud +and surface-based formats. +
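The sampling and outlier-rejection step lends itself to a compact sketch: draw points from each Gaussian's density and keep only those within a Mahalanobis radius of the centre. This is a NumPy illustration under an assumed covariance construction and threshold, not code from the 3DGS-to-PC package.

```python
import numpy as np

def sample_gaussian_points(mean, cov, n_points, max_mahalanobis=3.0, rng=None):
    """Draw points from one 3D Gaussian and drop extreme outliers.

    mean: (3,) Gaussian centre.
    cov:  (3, 3) covariance (e.g. built from the splat's scale and rotation).
    n_points: number of samples requested before filtering.
    """
    rng = np.random.default_rng() if rng is None else rng
    pts = rng.multivariate_normal(mean, cov, size=n_points)
    inv_cov = np.linalg.inv(cov)
    d = pts - mean
    mahalanobis = np.sqrt(np.einsum('ij,jk,ik->i', d, inv_cov, d))
    return pts[mahalanobis <= max_mahalanobis]

# Example: an anisotropic splat stretched along x.
points = sample_gaussian_points(np.zeros(3), np.diag([0.04, 0.01, 0.01]), 500)
print(points.shape)
```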
+
+
+
+
+ + ☆ A Survey on Dynamic Neural Networks: from Computer Vision to Multi-modal + Sensor Fusion + + +
+ Model compression is essential in the deployment of large Computer Vision +models on embedded devices. However, static optimization techniques (e.g. +pruning, quantization, etc.) neglect the fact that different inputs have +different complexities, thus requiring different amounts of computation. +Dynamic Neural Networks allow the amount of computation to be conditioned on +the specific input. The current literature on the topic is very extensive and +fragmented. We present a comprehensive survey that synthesizes and unifies +existing Dynamic Neural Networks research in the context of Computer Vision. +Additionally, we provide a logical taxonomy based on which component of the +network is adaptive: the output, the computation graph or the input. +Furthermore, we argue that Dynamic Neural Networks are particularly beneficial +in the context of Sensor Fusion for better adaptivity, noise reduction and +information prioritization. We also present preliminary work in this direction. +
+
+ comment: Under review at International Journal of Computer Vision +
+
+
+
+
+ + ☆ PrecipDiff: Leveraging image diffusion models to enhance satellite-based + precipitation observations + + +
+ A recent report from the World Meteorological Organization (WMO) highlights +that water-related disasters have caused the highest human losses among natural +disasters over the past 50 years, with over 91\% of deaths occurring in +low-income countries. This disparity is largely due to the lack of adequate +ground monitoring stations, such as weather surveillance radars (WSR), which +are expensive to install. For example, while the US and Europe combined possess +over 600 WSRs, Africa, despite having almost one and a half times their +landmass, has fewer than 40. To address this issue, satellite-based +observations offer a global, near-real-time monitoring solution. However, they +face several challenges, such as limited accuracy, bias, and low spatial +resolution. This study leverages the power of diffusion models and residual +learning to address these limitations in a unified framework. We introduce the +first diffusion model for correcting the inconsistency between different +precipitation products. Our method demonstrates its effectiveness in +downscaling satellite precipitation estimates from 10 km to 1 km resolution. +Extensive experiments conducted in the Seattle region demonstrate significant +improvements in accuracy, bias reduction, and spatial detail. Importantly, our +approach achieves these results using only precipitation data, showcasing the +potential of a purely computer vision-based approach for enhancing satellite +precipitation products and paving the way for further advancements in this +domain. +
+
+
+
+
+ + ☆ Guided SAM: Label-Efficient Part Segmentation + + +
+ Localizing object parts precisely is essential for tasks such as object +recognition and robotic manipulation. Recent part segmentation methods require +extensive training data and labor-intensive annotations. Segment-Anything Model +(SAM) has demonstrated good performance on a wide range of segmentation +problems, but requires (manual) positional prompts to guide it where to +segment. Furthermore, since it has been trained on full objects instead of +object parts, it is prone to over-segmentation of parts. To address this, we +propose a novel approach that guides SAM towards the relevant object parts. Our +method learns positional prompts from coarse patch annotations that are easier +and cheaper to acquire. We train classifiers on image patches to identify part +classes and aggregate patches into regions of interest (ROIs) with positional +prompts. SAM is conditioned on these ROIs and prompts. This approach, termed +`Guided SAM', enhances efficiency and reduces manual effort, allowing effective +part segmentation with minimal labeled data. We demonstrate the efficacy of +Guided SAM on a dataset of car parts, improving the average IoU on state of the +art models from 0.37 to 0.49 with annotations that are on average five times +more efficient to acquire. + +
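A minimal sketch of the patch-to-prompt idea, assuming a grid of per-patch part probabilities from the patch classifiers: positive patches are aggregated into a region of interest and reduced to a single positional point prompt. The function names and threshold are hypothetical, and the actual Guided SAM aggregation may differ.

```python
import numpy as np

def patches_to_point_prompt(patch_scores, patch_size, threshold=0.5):
    """Turn a grid of per-patch part probabilities into one positional prompt.

    patch_scores: (Hp, Wp) probability that each patch belongs to the part.
    Returns an (x, y) pixel coordinate at the centroid of the selected patches,
    or None if no patch passes the threshold.
    """
    ys, xs = np.nonzero(patch_scores >= threshold)
    if len(xs) == 0:
        return None
    # Centroid of positive patches, scaled back to pixel coordinates.
    cx = (xs.mean() + 0.5) * patch_size
    cy = (ys.mean() + 0.5) * patch_size
    return float(cx), float(cy)

# The returned point would then be handed to SAM as a positive point prompt,
# e.g. predictor.predict(point_coords=..., point_labels=...) in the official
# SamPredictor interface (shown for illustration only).
```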
+
+
+
+
+ + ☆ Diff-Ensembler: Learning to Ensemble 2D Diffusion Models for + Volume-to-Volume Medical Image Translation + + +
+ Despite success in volume-to-volume translations in medical images, most +existing models struggle to effectively capture the inherent volumetric +distribution using 3D representations. The current state-of-the-art approach +combines multiple 2D-based networks through weighted averaging, thereby +neglecting the 3D spatial structures. Directly training 3D models in medical +imaging presents significant challenges due to high computational demands and +the need for large-scale datasets. To address these challenges, we introduce +Diff-Ensembler, a novel hybrid 2D-3D model for efficient and effective +volumetric translations by ensembling perpendicularly trained 2D diffusion +models with a 3D network in each diffusion step. Moreover, our model can +naturally be used to ensemble diffusion models conditioned on different +modalities, allowing flexible and accurate fusion of input conditions. +Extensive experiments demonstrate that Diff-Ensembler attains superior accuracy +and volumetric realism in 3D medical image super-resolution and modality +translation. We further demonstrate the strength of our model's volumetric +realism using tumor segmentation as a downstream task. + +
+
+
+
+
+ + ☆ OCORD: Open-Campus Object Removal Dataset + + +
+ The rapid advancements in generative models, particularly diffusion-based +techniques, have revolutionized image inpainting tasks by enabling the +generation of high-fidelity and diverse content. However, object removal +remains under-explored as a specific subset of inpainting, facing challenges +such as inadequate semantic understanding and the unintended generation of +artifacts. Existing datasets for object removal often rely on synthetic data, +which fails to align with real-world scenarios, limiting model performance. +Although some real-world datasets address these issues partially, they suffer +from scalability, annotation inefficiencies, and limited realism in physical +phenomena such as lighting and shadows. To address these limitations, this +paper introduces a novel approach to object removal by constructing a +high-resolution real-world dataset through long-duration video capture with +fixed camera settings. Leveraging advanced tools such as Grounding-DINO, +Segment-Anything-Model, and MASA for automated annotation, we provide image, +background, and mask pairs while significantly reducing annotation time and +labor. With our efficient annotation pipeline, we release the first fully open, +high-resolution real-world dataset for object removal, and demonstrate improved +performance in object removal tasks through fine-tuning of pre-trained +diffusion models. +
+
+ comment: technical report +
+
+
+
+
+ + ☆ Zero-Shot Scene Understanding for Automatic Target Recognition Using + Large Vision-Language Models + + +
+ Automatic target recognition (ATR) plays a critical role in tasks such as +navigation and surveillance, where safety and accuracy are paramount. In +extreme use cases, such as military applications, these factors are often +challenged due to the presence of unknown terrains, environmental conditions, +and novel object categories. Current object detectors, including open-world +detectors, lack the ability to confidently recognize novel objects or operate +in unknown environments, as they have not been exposed to these new conditions. +However, Large Vision-Language Models (LVLMs) exhibit emergent properties that +enable them to recognize objects in varying conditions in a zero-shot manner. +Despite this, LVLMs struggle to localize objects effectively within a scene. To +address these limitations, we propose a novel pipeline that combines the +detection capabilities of open-world detectors with the recognition confidence +of LVLMs, creating a robust system for zero-shot ATR of novel classes and +unknown domains. In this study, we compare the performance of various LVLMs for +recognizing military vehicles, which are often underrepresented in training +datasets. Additionally, we examine the impact of factors such as distance +range, modality, and prompting methods on the recognition performance, +providing insights into the development of more reliable ATR systems for novel +conditions and classes. + +
+
+
+
+
+ + ☆ Kolmogorov-Arnold Network for Remote Sensing Image Semantic Segmentation + + +
+ Semantic segmentation plays a crucial role in remote sensing applications, +where the accurate extraction and representation of features are essential for +high-quality results. Despite the widespread use of encoder-decoder +architectures, existing methods often struggle with fully utilizing the +high-dimensional features extracted by the encoder and efficiently recovering +detailed information during decoding. To address these problems, we propose a +novel semantic segmentation network, namely DeepKANSeg, including two key +innovations based on the emerging Kolmogorov Arnold Network (KAN). Notably, the +advantage of KAN lies in its ability to decompose high-dimensional complex +functions into univariate transformations, enabling efficient and flexible +representation of intricate relationships in data. First, we introduce a +KAN-based deep feature refinement module, namely DeepKAN to effectively capture +complex spatial and rich semantic relationships from high-dimensional features. +Second, we replace the traditional multi-layer perceptron (MLP) layers in the +global-local combined decoder with KAN-based linear layers, namely GLKAN. This +module enhances the decoder's ability to capture fine-grained details during +decoding. To evaluate the effectiveness of the proposed method, experiments are +conducted on two well-known fine-resolution remote sensing benchmark datasets, +namely ISPRS Vaihingen and ISPRS Potsdam. The results demonstrate that the +KAN-enhanced segmentation model achieves superior performance in terms of +accuracy compared to state-of-the-art methods. They highlight the potential of +KANs as a powerful alternative to traditional architectures in semantic +segmentation tasks. Moreover, the explicit univariate decomposition provides +improved interpretability, which is particularly beneficial for applications +requiring explainable learning in remote sensing. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ FedSemiDG: Domain Generalized Federated Semi-supervised Medical Image + Segmentation + + +
+ Medical image segmentation is challenging due to the diversity of medical +images and the lack of labeled data, which motivates recent developments in +federated semi-supervised learning (FSSL) to leverage a large amount of +unlabeled data from multiple centers for model training without sharing raw +data. However, what remains under-explored in FSSL is the domain shift problem +which may cause suboptimal model aggregation and low effectivity of the +utilization of unlabeled data, eventually leading to unsatisfactory performance +in unseen domains. In this paper, we explore this previously ignored scenario, +namely domain generalized federated semi-supervised learning (FedSemiDG), which +aims to learn a model in a distributed manner from multiple domains with +limited labeled data and abundant unlabeled data such that the model can +generalize well to unseen domains. We present a novel framework, Federated +Generalization-Aware SemiSupervised Learning (FGASL), to address the challenges +in FedSemiDG by effectively tackling critical issues at both global and local +levels. Globally, we introduce Generalization-Aware Aggregation (GAA), +assigning adaptive weights to local models based on their generalization +performance. Locally, we use a Dual-Teacher Adaptive Pseudo Label Refinement +(DR) strategy to combine global and domain-specific knowledge, generating more +reliable pseudo labels. Additionally, Perturbation-Invariant Alignment (PIA) +enforces feature consistency under perturbations, promoting domain-invariant +learning. Extensive experiments on three medical segmentation tasks (cardiac +MRI, spine MRI and bladder cancer MRI) demonstrate that our method +significantly outperforms state-of-the-art FSSL and domain generalization +approaches, achieving robust generalization on unseen domains. + +
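A rough sketch of how generalization-aware aggregation could look in code, assuming each client reports a scalar generalization score (e.g. segmentation accuracy on held-out domains): the scores are turned into softmax weights for a FedAvg-style average. The temperature, scoring signal, and handling of non-float buffers are assumptions, not the FGASL implementation.

```python
import copy
import torch

def generalization_aware_aggregate(client_states, gen_scores, temperature=1.0):
    """FedAvg-style aggregation weighted by generalization performance.

    client_states: list of state_dicts from the local client models.
    gen_scores:    one scalar per client (higher = generalizes better),
                   e.g. Dice on data from unseen domains.
    """
    w = torch.softmax(torch.tensor(gen_scores, dtype=torch.float) / temperature, dim=0)
    global_state = copy.deepcopy(client_states[0])
    for key in global_state:
        # Non-float buffers (e.g. BatchNorm counters) would need special
        # handling in a real system; they are averaged as floats here.
        global_state[key] = sum(wi * cs[key].float()
                                for wi, cs in zip(w, client_states))
    return global_state
```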
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ TimberVision: A Multi-Task Dataset and Framework for Log-Component + Segmentation and Tracking in Autonomous Forestry Operations + + +
+ Timber represents an increasingly valuable and versatile resource. However, +forestry operations such as harvesting, handling and measuring logs still +require substantial human labor in remote environments posing significant +safety risks. Progressively automating these tasks has the potential of +increasing their efficiency as well as safety, but requires an accurate +detection of individual logs as well as live trees and their context. Although +initial approaches have been proposed for this challenging application domain, +specialized data and algorithms are still too scarce to develop robust +solutions. To mitigate this gap, we introduce the TimberVision dataset, +consisting of more than 2k annotated RGB images containing a total of 51k trunk +components including cut and lateral surfaces, thereby surpassing any existing +dataset in this domain in terms of both quantity and detail by a large margin. +Based on this data, we conduct a series of ablation experiments for oriented +object detection and instance segmentation and evaluate the influence of +multiple scene parameters on model performance. We introduce a generic +framework to fuse the components detected by our models for both tasks into +unified trunk representations. Furthermore, we automatically derive geometric +properties and apply multi-object tracking to further enhance robustness. Our +detection and tracking approach provides highly descriptive and accurate trunk +representations solely from RGB image data, even under challenging +environmental conditions. Our solution is suitable for a wide range of +application scenarios and can be readily combined with other sensor modalities. + +
+
+ comment: Accepted at Winter Conference on Applications of Computer Vision + (WACV) 2025. Code and dataset available at + https://github.com/timbervision/timbervision +
+
+
+
+
+ + ☆ A method for estimating roadway billboard salience + + +
+ Roadside billboards and other forms of outdoor advertising play a crucial +role in marketing initiatives; however, they can also distract drivers, +potentially contributing to accidents. This study delves into the significance +of roadside advertising in images captured from a driver's perspective. +Firstly, it evaluates the effectiveness of neural networks in detecting +advertising along roads, focusing on the YOLOv5 and Faster R-CNN models. +Secondly, the study addresses the determination of billboard significance using +methods for saliency extraction. The UniSal and SpectralResidual methods were +employed to create saliency maps for each image. The study establishes a +database of eye tracking sessions captured during city highway driving to +assess the saliency models. + +
+
+
+
+
+ + ☆ Anonymization of Documents for Law Enforcement with Machine Learning + + +
+ The steadily increasing utilization of data-driven methods and approaches in +areas that handle sensitive personal information such as in law enforcement +mandates an ever increasing effort in these institutions to comply with data +protection guidelines. In this work, we present a system for automatically +anonymizing images of scanned documents, reducing manual effort while ensuring +data protection compliance. Our method considers the viability of further +forensic processing after anonymization by minimizing automatically redacted +areas by combining automatic detection of sensitive regions with knowledge from +a manually anonymized reference document. Using a self-supervised image model +for instance retrieval of the reference document, our approach requires only +one anonymized example to efficiently redact all documents of the same type, +significantly reducing processing time. We show that our approach outperforms +both a purely automatic redaction system and also a naive copy-paste scheme of +the reference anonymization to other documents on a hand-crafted dataset of +ground truth redactions. + +
+
+ comment: Accepted at IEEE Symposium on CI in Security, Defence and Biometrics + 2025 (IEEE CISDB) +
+
+
+
+
+ + ☆ Localization-Aware Multi-Scale Representation Learning for Repetitive + Action Counting + + +
+ Repetitive action counting (RAC) aims to estimate the number of +class-agnostic action occurrences in a video without exemplars. Most current +RAC methods rely on a raw frame-to-frame similarity representation for period +prediction. However, this approach can be significantly disrupted by common +noise such as action interruptions and inconsistencies, leading to sub-optimal +counting performance in realistic scenarios. In this paper, we introduce a +foreground localization optimization objective into similarity representation +learning to obtain more robust and efficient video features. We propose a +Localization-Aware Multi-Scale Representation Learning (LMRL) framework. +Specifically, we apply a Multi-Scale Period-Aware Representation (MPR) with a +scale-specific design to accommodate various action frequencies and learn more +flexible temporal correlations. Furthermore, we introduce the Repetition +Foreground Localization (RFL) method, which enhances the representation by +coarsely identifying periodic actions and incorporating global semantic +information. These two modules can be jointly optimized, resulting in a more +discerning periodic action representation. Our approach significantly reduces +the impact of noise, thereby improving counting accuracy. Additionally, the +framework is designed to be scalable and adaptable to different types of video +content. Experimental results on the RepCountA and UCFRep datasets demonstrate +that our proposed method effectively handles repetitive action counting. + +
+
+ comment: Accepted by IEEE VCIP2024 +
+
+
+
+
+ + ☆ The Devil is in the Spurious Correlation: Boosting Moment Retrieval via + Temporal Dynamic Learning + + +
+ Given a textual query along with a corresponding video, the objective of +moment retrieval aims to localize the moments relevant to the query within the +video. While commendable results have been demonstrated by existing +transformer-based approaches, predicting the accurate temporal span of the +target moment is currently still a major challenge. In this paper, we reveal +that a crucial reason stems from the spurious correlation between the text +queries and the moment context. Namely, the model may associate the textual +query with the background frames rather than the target moment. To address this +issue, we propose a temporal dynamic learning approach for moment retrieval, +where two strategies are designed to mitigate the spurious correlation. First, +we introduce a novel video synthesis approach to construct a dynamic context +for the relevant moment. With separate yet similar videos mixed up, the +synthesis approach empowers our model to attend to the target moment of the +corresponding query under various dynamic contexts. Second, we enhance the +representation by learning temporal dynamics. Besides the visual +representation, text queries are aligned with temporal dynamic representations, +which enables our model to establish a non-spurious correlation between the +query-related moment and context. With the aforementioned proposed method, the +spurious correlation issue in moment retrieval can be largely alleviated. Our +method establishes a new state-of-the-art performance on two popular benchmarks +of moment retrieval, \ie, QVHighlights and Charades-STA. In addition, the +detailed ablation analyses demonstrate the effectiveness of the proposed +strategies. Our code will be publicly available. + +
+
+
+
+
+ + ☆ Code and Pixels: Multi-Modal Contrastive Pre-training for Enhanced + Tabular Data Analysis + + +
+ Learning from tabular data is of paramount importance, as it complements the +conventional analysis of image and video data by providing a rich source of +structured information that is often critical for comprehensive understanding +and decision-making processes. We present Multi-task Contrastive Masked Tabular +Modeling (MT-CMTM), a novel method aiming to enhance tabular models by +leveraging the correlation between tabular data and corresponding images. +MT-CMTM employs a dual strategy combining contrastive learning with masked +tabular modeling, optimizing the synergy between these data modalities. + Central to our approach is a 1D Convolutional Neural Network with residual +connections and an attention mechanism (1D-ResNet-CBAM), designed to +efficiently process tabular data without relying on images. This enables +MT-CMTM to handle purely tabular data for downstream tasks, eliminating the +need for potentially costly image acquisition and processing. + We evaluated MT-CMTM on the DVM car dataset, which is uniquely suited for +this particular scenario, and the newly developed HIPMP dataset, which connects +membrane fabrication parameters with image data. Our MT-CMTM model outperforms +the proposed tabular 1D-ResNet-CBAM, which is trained from scratch, achieving a +1.48% relative improvement in MSE on HIPMP and a 2.38% absolute increase in +accuracy on DVM. These results demonstrate MT-CMTM's robustness and its +potential to advance the field of multi-modal learning. +
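The contrastive half of such a dual strategy can be sketched as a symmetric InfoNCE loss between paired tabular and image embeddings; the temperature and the assumption of already-projected embeddings are illustrative choices rather than the MT-CMTM configuration.

```python
import torch
import torch.nn.functional as F

def tabular_image_contrastive_loss(tab_emb, img_emb, temperature=0.07):
    """Symmetric InfoNCE between paired tabular and image embeddings.

    tab_emb, img_emb: (B, D) projections of the same B samples; matching rows
    are positives, all other pairs in the batch act as negatives.
    """
    tab = F.normalize(tab_emb, dim=-1)
    img = F.normalize(img_emb, dim=-1)
    logits = tab @ img.t() / temperature
    targets = torch.arange(tab.size(0), device=tab.device)
    # Average the tabular-to-image and image-to-tabular directions.
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))
```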
+
+
+
+
+ + ☆ Comparative analysis of optical character recognition methods for Sámi + texts from the National Library of Norway + + +
+ Optical Character Recognition (OCR) is crucial to the National Library of +Norway's (NLN) digitisation process as it converts scanned documents into +machine-readable text. However, for the S\'ami documents in NLN's collection, +the OCR accuracy is insufficient. Given that OCR quality affects downstream +processes, evaluating and improving OCR for text written in S\'ami languages is +necessary to make these resources accessible. To address this need, this work +fine-tunes and evaluates three established OCR approaches, Transkribus, +Tesseract and TrOCR, for transcribing S\'ami texts from NLN's collection. Our +results show that Transkribus and TrOCR outperform Tesseract on this task, +while Tesseract achieves superior performance on an out-of-domain dataset. +Furthermore, we show that fine-tuning pre-trained models and supplementing +manual annotations with machine annotations and synthetic text images can yield +accurate OCR for S\'ami languages, even with a moderate amount of manually +annotated data. + +
+
+ comment: To be published in Proceedings of the 25th Nordic Conference on + Computational Linguistics (NoDaLiDa) +
+
+
+
+
+ + ☆ Toward Realistic Camouflaged Object Detection: Benchmarks and Method + + +
+ Camouflaged object detection (COD) primarily relies on semantic or instance +segmentation methods. While these methods have made significant advancements in +identifying the contours of camouflaged objects, they may be inefficient or not +cost-effective for tasks that only require the specific location of the object. +Object detection algorithms offer an optimized solution for Realistic +Camouflaged Object Detection (RCOD) in such cases. However, detecting +camouflaged objects remains a formidable challenge due to the high degree of +similarity between the features of the objects and their backgrounds. Unlike +segmentation methods that perform pixel-wise comparisons to differentiate +between foreground and background, object detectors omit this analysis, further +aggravating the challenge. To solve this problem, we propose a camouflage-aware +feature refinement (CAFR) strategy. Since camouflaged objects are not rare +categories, CAFR fully utilizes a clear perception of the current object within +the prior knowledge of large models to assist detectors in deeply understanding +the distinctions between background and foreground. Specifically, in CAFR, we +introduce the Adaptive Gradient Propagation (AGP) module that fine-tunes all +feature extractor layers in large detection models to fully refine +class-specific features from camouflaged contexts. We then design the Sparse +Feature Refinement (SFR) module that optimizes the transformer-based feature +extractor to focus primarily on capturing class-specific features in +camouflaged scenarios. To facilitate the assessment of RCOD tasks, we manually +annotate the labels required for detection on three existing segmentation COD +datasets, creating a new benchmark for RCOD tasks. Code and datasets are +available at: https://github.com/zhimengXin/RCOD. +
+
+
+
+
+ + ☆ Event-based Video Person Re-identification via Cross-Modality and + Temporal Collaboration + + +
+ Video-based person re-identification (ReID) has become increasingly important +due to its applications in video surveillance. By employing events in +video-based person ReID, more motion information can be provided between +consecutive frames to improve recognition accuracy. Previous approaches have +introduced event data into the video person ReID task, but they still cannot +avoid the privacy leakage problem caused by RGB images. In order to avoid +privacy attacks and to take advantage of the benefits of event data, we +consider using only event data. To make full use of the information in the +event stream, we propose a Cross-Modality and Temporal Collaboration (CMTC) +network for event-based video person ReID. First, we design an event transform +network to obtain corresponding auxiliary information from the input of raw +events. Additionally, we propose a differential modality collaboration module +to balance the roles of events and auxiliaries to achieve complementary +effects. Furthermore, we introduce a temporal collaboration module to exploit +motion information and appearance cues. Experimental results demonstrate that +our method outperforms others in the task of event-based video person ReID. +
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ☆ Skip Mamba Diffusion for Monocular 3D Semantic Scene Completion + + +
+ 3D semantic scene completion is critical for multiple downstream tasks in +autonomous systems. It estimates missing geometric and semantic information in +the acquired scene data. Due to the challenging real-world conditions, this +task usually demands complex models that process multi-modal data to achieve +acceptable performance. We propose a unique neural model, leveraging advances +from the state space and diffusion generative modeling to achieve remarkable 3D +semantic scene completion performance with monocular image input. Our technique +processes the data in the conditioned latent space of a variational autoencoder +where diffusion modeling is carried out with an innovative state space +technique. A key component of our neural network is the proposed Skimba (Skip +Mamba) denoiser, which is adept at efficiently processing long-sequence data. +The Skimba diffusion model is integral to our 3D scene completion network, +incorporating a triple Mamba structure, dimensional decomposition residuals and +varying dilations along three directions. We also adopt a variant of this +network for the subsequent semantic segmentation stage of our method. Extensive +evaluation on the standard SemanticKITTI and SSCBench-KITTI360 datasets show +that our approach not only outperforms other monocular techniques by a large +margin, it also achieves competitive performance against stereo methods. The +code is available at https://github.com/xrkong/skimba + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ EdgeTAM: On-Device Track Anything Model + + +
+ On top of Segment Anything Model (SAM), SAM 2 further extends its capability +from image to video inputs through a memory bank mechanism and obtains a +remarkable performance compared with previous methods, making it a foundation +model for video segmentation task. In this paper, we aim at making SAM 2 much +more efficient so that it even runs on mobile devices while maintaining a +comparable performance. Despite several works optimizing SAM for better +efficiency, we find they are not sufficient for SAM 2 because they all focus on +compressing the image encoder, while our benchmark shows that the newly +introduced memory attention blocks are also the latency bottleneck. Given this +observation, we propose EdgeTAM, which leverages a novel 2D Spatial Perceiver +to reduce the computational cost. In particular, the proposed 2D Spatial +Perceiver encodes the densely stored frame-level memories with a lightweight +Transformer that contains a fixed set of learnable queries. Given that video +segmentation is a dense prediction task, we find preserving the spatial +structure of the memories is essential so that the queries are split into +global-level and patch-level groups. We also propose a distillation pipeline +that further improves the performance without inference overhead. As a result, +EdgeTAM achieves 87.7, 70.0, 72.3, and 71.7 J&F on DAVIS 2017, MOSE, SA-V val, +and SA-V test, while running at 16 FPS on iPhone 15 Pro Max. + +
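The memory-compression idea can be sketched as cross-attention from a fixed set of learnable queries onto the dense frame-level memory tokens; the dimensions, number of queries, and layer layout below are placeholders and omit the global/patch-level query split, so this illustrates the general mechanism rather than the EdgeTAM module.

```python
import torch
import torch.nn as nn

class SpatialPerceiverSketch(nn.Module):
    """Compress dense memory tokens into a small set of learned latents."""

    def __init__(self, dim=256, num_queries=64, num_heads=8):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(num_queries, dim) * 0.02)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.ffn = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, dim * 4),
                                 nn.GELU(), nn.Linear(dim * 4, dim))

    def forward(self, memory_tokens):
        # memory_tokens: (B, H*W, dim) densely stored frame-level memories.
        q = self.queries.unsqueeze(0).expand(memory_tokens.size(0), -1, -1)
        latents, _ = self.attn(q, memory_tokens, memory_tokens)
        return latents + self.ffn(latents)               # (B, num_queries, dim)

# Example: 4096 memory tokens are summarized by 64 latent queries.
summaries = SpatialPerceiverSketch()(torch.randn(2, 64 * 64, 256))
print(summaries.shape)   # torch.Size([2, 64, 256])
```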
+
+ comment: Code will be released at https://github.com/facebookresearch/EdgeTAM +
+
+
+
+
+ + MOS-Attack: A Scalable Multi-objective Adversarial Attack Framework CVPR 2025 + + +
+ Crafting adversarial examples is crucial for evaluating and enhancing the +robustness of Deep Neural Networks (DNNs), presenting a challenge equivalent to +maximizing a non-differentiable 0-1 loss function. + However, existing single-objective methods, namely adversarial attacks that +focus on a single surrogate loss function, do not fully harness the benefits of +multiple loss functions, owing to an insufficient understanding of their +synergistic and conflicting nature. + To overcome these limitations, we propose the Multi-Objective Set-based +Attack (MOS Attack), a novel adversarial attack framework leveraging multiple +loss functions and automatically uncovering their interrelations. + The MOS Attack adopts a set-based multi-objective optimization strategy, +enabling the incorporation of numerous loss functions without additional +parameters. + It also automatically mines synergistic patterns among various losses, +facilitating the generation of potent adversarial attacks with fewer +objectives. + Extensive experiments have shown that our MOS Attack outperforms +single-objective attacks. Furthermore, by harnessing the identified synergistic +patterns, MOS Attack continues to show superior results with a reduced number +of loss functions. +
+
+ comment: Under Review of CVPR 2025 +
+
+
+
+
+ + ☆ Implicit Neural Representations for Registration of Left Ventricle + Myocardium During a Cardiac Cycle + + +
+ Understanding the movement of the left ventricle myocardium (LVmyo) during +the cardiac cycle is essential for assessing cardiac function. One way to model +this movement is through a series of deformable image registrations (DIRs) of +the LVmyo. Traditional deep learning methods for DIRs, such as those based on +convolutional neural networks, often require substantial memory and +computational resources. In contrast, implicit neural representations (INRs) +offer an efficient approach by operating on any number of continuous points. +This study extends the use of INRs for DIR to cardiac computed tomography (CT), +focusing on LVmyo registration. To enhance the precision of the registration +around the LVmyo, we incorporate the signed distance field of the LVmyo with +the Hounsfield Unit values from the CT frames. This guides the registration of +the LVmyo, while keeping the tissue information from the CT frames. Our +framework demonstrates high registration accuracy and provides a robust method +for temporal registration that facilitates further analysis of LVmyo motion. + +
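A minimal sketch of an INR-style deformation model for this setting: continuous (x, y, z) coordinates plus a cardiac-phase/time index are Fourier-encoded and mapped by an MLP to a 3D displacement. Layer sizes, the encoding, and the time parameterization are assumptions for illustration; the paper's network and its SDF/Hounsfield-guided objective are not reproduced here.

```python
import torch
import torch.nn as nn

class FourierFeatures(nn.Module):
    """Sin/cos positional encoding of continuous (x, y, z, t) inputs."""
    def __init__(self, in_dim=4, n_freqs=6):
        super().__init__()
        self.freqs = 2.0 ** torch.arange(n_freqs) * torch.pi

    def forward(self, x):                                # x: (N, in_dim)
        ang = x[..., None] * self.freqs                  # (N, in_dim, n_freqs)
        return torch.cat([torch.sin(ang), torch.cos(ang)], -1).flatten(1)

class DeformationINR(nn.Module):
    """Continuous coordinates plus frame index -> 3D displacement."""
    def __init__(self, hidden=256, n_freqs=6):
        super().__init__()
        self.enc = FourierFeatures(4, n_freqs)
        self.mlp = nn.Sequential(
            nn.Linear(4 * 2 * n_freqs, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 3))

    def forward(self, coords_t):                         # (N, 4)
        return self.mlp(self.enc(coords_t))              # (N, 3) displacement

disp = DeformationINR()(torch.rand(1024, 4))
print(disp.shape)
```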
+
+ comment: 9 pages, 5 figures, STACOM 2024 +
+
+
+
+
+ + ☆ Depth and Image Fusion for Road Obstacle Detection Using Stereo Camera + + +
+ This paper is devoted to the detection of objects on a road, performed with a +combination of two methods that use both depth information and video analysis +of data from a stereo camera. Since neither the time of appearance of an object +on the road, nor its size and shape, is known in advance, ML/DL-based +approaches are not applicable. The task becomes more complicated due to +variations in artificial illumination, inhomogeneous road surface texture, and +the unknown character and features of the object. To solve this problem we +developed a depth and image fusion method that complements the search for small +low-contrast objects by an RGB-based method with obstacle detection by a stereo +image-based approach using SLIC superpixel segmentation. We conducted +experiments with static and low-speed obstacles in an underground parking lot +and demonstrated the successful operation of the developed technique for +detecting and even tracking small objects, such as parking infrastructure +objects, things left on the road, wheels, dropped boxes, etc. +
+
+ comment: 8 pages, 15 figures +
+
+
+
+
+ + ☆ Can Vision-Language Models Evaluate Handwritten Math? + + +
+ Recent advancements in Vision-Language Models (VLMs) have opened new +possibilities in automatic grading of handwritten student responses, +particularly in mathematics. However, a comprehensive study to test the ability +of VLMs to evaluate and reason over handwritten content remains absent. To +address this gap, we introduce FERMAT, a benchmark designed to assess the +ability of VLMs to detect, localize and correct errors in handwritten +mathematical content. FERMAT spans four key error dimensions - computational, +conceptual, notational, and presentation - and comprises over 2,200 handwritten +math solutions derived from 609 manually curated problems from grades 7-12 with +intentionally introduced perturbations. Using FERMAT, we benchmark nine VLMs +across three tasks: error detection, localization, and correction. Our results +reveal significant shortcomings in current VLMs in reasoning over handwritten +text, with Gemini-1.5-Pro achieving the highest error correction rate (77%). We +also observed that some models struggle with processing handwritten content, as +their accuracy improves when handwritten inputs are replaced with printed text +or images. These findings highlight the limitations of current VLMs and reveal +new avenues for improvement. We release FERMAT and all the associated resources +as open source to drive further research. +
+
+
+
+
+ + ☆ CSTA: Spatial-Temporal Causal Adaptive Learning for Exemplar-Free Video + Class-Incremental Learning + + +
+ Continual learning aims to acquire new knowledge while retaining past +information. Class-incremental learning (CIL) presents a challenging scenario +where classes are introduced sequentially. For video data, the task becomes +more complex than image data because it requires learning and preserving both +spatial appearance and temporal action involvement. To address this challenge, +we propose a novel exemplar-free framework that equips separate spatiotemporal +adapters to learn new class patterns, accommodating the incremental information +representation requirements unique to each class. While separate adapters are +proven to mitigate forgetting and fit unique requirements, naively applying +them hinders the intrinsic connection between spatial and temporal information +increments, affecting the efficiency of representing newly learned class +information. Motivated by this, we introduce two key innovations from a causal +perspective. First, a causal distillation module is devised to maintain the +relation between spatial-temporal knowledge for a more efficient +representation. Second, a causal compensation mechanism is proposed to reduce +the conflicts during increment and memorization between different types of +information. Extensive experiments conducted on benchmark datasets demonstrate +that our framework can achieve new state-of-the-art results, surpassing current +example-based methods by 4.2% in accuracy on average. + +
+
+ comment: IEEE TCSVT Submission +
+
+
+
+
+ + ☆ MECD+: Unlocking Event-Level Causal Graph Discovery for Video Reasoning TPAMI + + +
+ Video causal reasoning aims to achieve a high-level understanding of videos +from a causal perspective. However, it exhibits limitations in its scope, +primarily executed in a question-answering paradigm and focusing on brief video +segments containing isolated events and basic causal relations, lacking +comprehensive and structured causality analysis for videos with multiple +interconnected events. To fill this gap, we introduce a new task and dataset, +Multi-Event Causal Discovery (MECD). It aims to uncover the causal relations +between events distributed chronologically across long videos. Given visual +segments and textual descriptions of events, MECD identifies the causal +associations between these events to derive a comprehensive and structured +event-level video causal graph explaining why and how the result event +occurred. To address the challenges of MECD, we devise a novel framework +inspired by the Granger Causality method, incorporating an efficient mask-based +event prediction model to perform an Event Granger Test. It estimates causality +by comparing the predicted result event when premise events are masked versus +unmasked. Furthermore, we integrate causal inference techniques such as +front-door adjustment and counterfactual inference to mitigate challenges in +MECD like causality confounding and illusory causality. Additionally, context +chain reasoning is introduced to conduct more robust and generalized reasoning. +Experiments validate the effectiveness of our framework in reasoning complete +causal relations, outperforming GPT-4o and VideoChat2 by 5.77% and 2.70%, +respectively. Further experiments demonstrate that causal relation graphs can +also contribute to downstream video understanding tasks such as video question +answering and video event prediction. + +
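The Event Granger Test can be illustrated abstractly: predict the result event from its preceding events with and without one premise event masked, and treat the increase in prediction error as evidence of a causal link. The predictor interface, event representation, and error measure below are placeholders, not the paper's model.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def event_granger_score(predictor, events, premise_idx, result_idx, mask_token):
    """Estimate the causal strength of one premise event on the result event.

    predictor:  callable mapping a (k, D) sequence of preceding event
                representations to a predicted (D,) result representation
                (placeholder interface).
    events:     (N, D) chronologically ordered event representations.
    Returns a scalar; larger means masking the premise hurts prediction more,
    i.e. stronger evidence of a causal relation.
    """
    context = events[:result_idx]
    pred_full = predictor(context)

    masked = context.clone()
    masked[premise_idx] = mask_token          # remove the premise event
    pred_masked = predictor(masked)

    target = events[result_idx]
    err_full = F.mse_loss(pred_full, target)
    err_masked = F.mse_loss(pred_masked, target)
    return (err_masked - err_full).item()
```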
+
+ comment: IEEE TPAMI Submission. arXiv admin note: substantial text overlap + with arXiv:2409.17647 +
+
+
+
+
+ + ☆ Exploring the Use of Contrastive Language-Image Pre-Training for Human + Posture Classification: Insights from Yoga Pose Analysis + + +
+ Accurate human posture classification in images and videos is crucial for +automated applications across various fields, including work safety, physical +rehabilitation, sports training, or daily assisted living. Recently, multimodal +learning methods, such as Contrastive Language-Image Pretraining (CLIP), have +advanced significantly in jointly understanding images and text. This study +aims to assess the effectiveness of CLIP in classifying human postures, +focusing on its application in yoga. Despite the initial limitations of the +zero-shot approach, applying transfer learning on 15,301 images (real and +synthetic) with 82 classes has shown promising results. The article describes +the full procedure for fine-tuning, including the choice for image description +syntax, models and hyperparameters adjustment. The fine-tuned CLIP model, +tested on 3826 images, achieves an accuracy of over 85%, surpassing the current +state-of-the-art of previous works on the same dataset by approximately 6%, its +training time being 3.5 times lower than what is needed to fine-tune a +YOLOv8-based model. For more application-oriented scenarios, with smaller +datasets of six postures each, containing 1301 and 401 training images, the +fine-tuned models attain an accuracy of 98.8% and 99.1%, respectively. +Furthermore, our experiments indicate that training with as few as 20 images +per pose can yield around 90% accuracy in a six-class dataset. This study +demonstrates that this multimodal technique can be effectively used for yoga +pose classification, and possibly for human posture classification, in general. +Additionally, CLIP inference time (around 7 ms) supports that the model can be +integrated into automated systems for posture evaluation, e.g., for developing +a real-time personal yoga assistant for performance assessment. + +
+
+
+
+
+ + ☆ TimeLogic: A Temporal Logic Benchmark for Video QA + + +
+ Temporal logical understanding, a core facet of human cognition, plays a +pivotal role in capturing complex sequential events and their temporal +relationships within videos. This capability is particularly crucial in tasks +like Video Question Answering (VideoQA), where the goal is to process visual +data over time together with textual data to provide coherent answers. However, +current VideoQA benchmarks devote little focus to evaluating this critical +skill due to the challenge of annotating temporal logic. Despite the +advancement of vision-language models, assessing their temporal logical +reasoning powers remains a challenge, primarily due to the lack of QA pairs +that demand formal, complex temporal reasoning. To bridge this gap, we +introduce the TimeLogic QA (TLQA) framework to automatically generate QA pairs +specifically designed to evaluate temporal logical understanding. To this end, +TLQA leverages temporal annotations from existing video datasets together with +temporal operators derived from logic theory to construct questions that test +understanding of event sequences and their temporal relationships. The TLQA +framework is generic and scalable, capable of leveraging both existing video +action datasets with temporal action segmentation annotations and video +datasets with temporal scene graph annotations to automatically generate +temporal logical questions. We leverage 4 datasets, STAR, Breakfast, AGQA, and +CrossTask, and generate two VideoQA dataset variants - small (TLQA-S) and large +(TLQA-L) - containing 2k and 10k QA pairs for each category, resulting in 32k +and 160k total pairs per dataset. We undertake a comprehensive evaluation of +leading-edge VideoQA models, employing TLQA to benchmark their temporal logical +understanding capabilities. We assess the VideoQA models' temporal reasoning +performance on 16 categories of temporal logic with varying temporal +complexity. +
+
+
+
+
+ + ☆ Multi-face emotion detection for effective Human-Robot Interaction + + +
+ The integration of dialogue interfaces in mobile devices has become +ubiquitous, providing a wide array of services. As technology progresses, +humanoid robots designed with human-like features to interact effectively with +people are gaining prominence, and the use of advanced human-robot dialogue +interfaces is continually expanding. In this context, emotion recognition plays +a crucial role in enhancing human-robot interaction by enabling robots to +understand human intentions. This research proposes a facial emotion detection +interface integrated into a mobile humanoid robot, capable of displaying +real-time emotions from multiple individuals on a user interface. To this end, +various deep neural network models for facial expression recognition were +developed and evaluated under consistent computer-based conditions, yielding +promising results. Afterwards, a trade-off between accuracy and memory +footprint was carefully considered to effectively implement this application on +a mobile humanoid robot. + +
+
+ comment: 9 pages, 8 figures and 1 table. Accepted at the 17th International + Conference on Agents and Artificial Intelligence (ICAART 2025), Porto, + Portugal +
+
+
+
+
+ + ☆ FaceOracle: Chat with a Face Image Oracle + + +
+ A face image is a mandatory part of ID and travel documents. Obtaining +high-quality face images when issuing such documents is crucial for both human +examiners and automated face recognition systems. In several international +standards, face image quality requirements are intricate and defined in detail. +Identifying and understanding non-compliance or defects in the submitted face +images is crucial for both issuing authorities and applicants. In this work, we +introduce FaceOracle, an LLM-powered AI assistant that helps its users analyze +a face image in a natural conversational manner using standard compliant +algorithms. Leveraging the power of LLMs, users can get explanations of various +face image quality concepts as well as interpret the outcome of face image +quality assessment (FIQA) algorithms. We implement a proof-of-concept that +demonstrates how experts at an issuing authority could integrate FaceOracle +into their workflow to analyze, understand, and communicate their decisions +more efficiently, resulting in enhanced productivity. + +
+
+
+
+
+ + ☆ Lung Cancer detection using Deep Learning + + +
+ In this paper we discuss lung cancer detection using a hybrid model of +Convolutional Neural Networks (CNNs) and Support Vector Machines (SVMs) for +early detection of benign or malignant tumors. The work trains this hybrid +model on Computed Tomography (CT) scans as the dataset. Using deep learning to +detect lung cancer early is a cutting-edge method. +
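A minimal sketch of such a CNN+SVM hybrid, assuming grayscale CT slices: a small convolutional encoder produces fixed-length embeddings, and a scikit-learn SVM is fitted on them. The architecture, input size, and random stand-in labels are purely illustrative and not the paper's configuration.

```python
import torch
import torch.nn as nn
from sklearn.svm import SVC

class SliceEncoder(nn.Module):
    """Tiny CNN that turns a CT slice into a fixed-length embedding."""
    def __init__(self, emb_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, emb_dim))

    def forward(self, x):                       # x: (B, 1, H, W)
        return self.net(x)

encoder = SliceEncoder().eval()
with torch.no_grad():
    train_emb = encoder(torch.randn(64, 1, 128, 128)).numpy()
labels = (torch.rand(64) > 0.5).long().numpy()   # stand-in benign/malignant labels

svm = SVC(kernel="rbf").fit(train_emb, labels)   # SVM on CNN embeddings
print(svm.predict(train_emb[:4]))
```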
+
+
+
+
+ + ☆ VAGeo: View-specific Attention for Cross-View Object Geo-Localization + + +
+ Cross-view object geo-localization (CVOGL) aims to locate an object of +interest in a captured ground- or drone-view image within the satellite image. +However, existing works treat ground-view and drone-view query images +equivalently, overlooking their inherent viewpoint discrepancies and the +spatial correlation between the query image and the satellite-view reference +image. To this end, this paper proposes a novel View-specific Attention +Geo-localization method (VAGeo) for accurate CVOGL. Specifically, VAGeo +contains two key modules: view-specific positional encoding (VSPE) module and +channel-spatial hybrid attention (CSHA) module. In object-level, according to +the characteristics of different viewpoints of ground and drone query images, +viewpoint-specific positional codings are designed to more accurately identify +the click-point object of the query image in the VSPE module. In feature-level, +a hybrid attention in the CSHA module is introduced by combining channel +attention and spatial attention mechanisms simultaneously for learning +discriminative features. Extensive experimental results demonstrate that the +proposed VAGeo gains a significant performance improvement, i.e., improving +acc@0.25/acc@0.5 on the CVOGL dataset from 45.43%/42.24% to 48.21%/45.22% for +ground-view, and from 61.97%/57.66% to 66.19%/61.87% for drone-view. + +
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ☆ A4O: All Trigger for One sample + + +
+ Backdoor attacks have become a critical threat to deep neural networks +(DNNs), drawing considerable research interest. However, most of the studied +attacks employ a single type of trigger. Consequently, proposed backdoor +defenders often rely on the assumption that triggers would appear in a unified +way. In this paper, we show that this naive assumption can create a loophole, +allowing more sophisticated backdoor attacks to bypass detection. We design a +novel backdoor attack mechanism that incorporates multiple types of backdoor +triggers, focusing on stealthiness and effectiveness. Our journey begins with +the intriguing observation that the performance of a backdoor attack in deep +learning models, as well as its detectability and removability, are all +proportional to the magnitude of the trigger. Based on this correlation, we +propose reducing the magnitude of each trigger type and combining them to +achieve a strong backdoor relying on the combined trigger while still staying +safely under the radar of defenders. Extensive experiments on three standard +datasets demonstrate that our method can achieve high attack success rates +(ASRs) while consistently bypassing state-of-the-art defenses. + 
+
+
+
+
+ + ☆ Uncertainty Guarantees on Automated Precision Weeding using Conformal + Prediction + + +
+ Precision agriculture in general, and precision weeding in particular, have +greatly benefited from the major advancements in deep learning and computer +vision. A large variety of commercial robotic solutions are already available +and deployed. However, the adoption by farmers of such solutions is still low +for many reasons, an important one being the lack of trust in these systems. +This is in great part due to the opaqueness and complexity of deep neural +networks and the manufacturers' inability to provide valid guarantees on their +performance. Conformal prediction, a well-established methodology in the +machine learning community, is an efficient and reliable strategy for providing +trustworthy guarantees on the predictions of any black-box model under very +minimal constraints. Bridging the gap between the safe machine learning and +precision agriculture communities, this article showcases conformal prediction +in action on the task of precision weeding through deep learning-based image +classification. After a detailed presentation of the conformal prediction +methodology and the development of a precision spraying pipeline based on a +''conformalized'' neural network and well-defined spraying decision rules, the +article evaluates this pipeline on two real-world scenarios: one under +in-distribution conditions, the other reflecting a near out-of-distribution +setting. The results show that we are able to provide formal, i.e. certifiable, +guarantees on spraying at least 90% of the weeds. + +
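+ To make the coverage guarantee above concrete, here is a minimal sketch of
+split conformal prediction for a binary weed/crop classifier, with a spraying
+rule that sprays whenever "weed" cannot be excluded from the prediction set.
+The random calibration data, class indices, and 90% level are illustrative
+assumptions, not the paper's pipeline.
+```python
+import numpy as np
+
+def conformal_threshold(cal_probs, cal_labels, alpha=0.1):
+    """Split conformal calibration; nonconformity score = 1 - prob of true class."""
+    n = len(cal_labels)
+    scores = np.sort(1.0 - cal_probs[np.arange(n), cal_labels])
+    k = int(np.ceil((n + 1) * (1 - alpha)))          # finite-sample correction
+    return scores[min(k, n) - 1]
+
+def prediction_set(probs, qhat):
+    """All classes that cannot be excluded at the chosen confidence level."""
+    return np.where(1.0 - probs <= qhat)[0]
+
+rng = np.random.default_rng(0)
+# Toy calibration data: softmax outputs for two classes [crop=0, weed=1].
+cal_probs = rng.dirichlet([2.0, 2.0], size=500)
+cal_labels = rng.integers(0, 2, size=500)
+qhat = conformal_threshold(cal_probs, cal_labels, alpha=0.1)   # 90% target
+
+test_probs = np.array([0.30, 0.70])              # one test image
+pred_set = prediction_set(test_probs, qhat)
+spray = 1 in pred_set                            # spray if "weed" is in the set
+print(qhat, pred_set, spray)
+```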
+
+
+
+
+ + ☆ Radial Distortion in Face Images: Detection and Impact + + +
+ Acquiring face images of sufficiently high quality is important for online ID +and travel document issuance applications using face recognition systems (FRS). +Low-quality, manipulated (intentionally or unintentionally), or distorted +images degrade the FRS performance and facilitate documents' misuse. Securing +quality for enrolment images, especially in the unsupervised self-enrolment +scenario via a smartphone, becomes important to assure FRS performance. In this +work, we focus on the less studied area of radial distortion (a.k.a., the +fish-eye effect) in face images and its impact on FRS performance. We introduce +an effective radial distortion detection model that can detect and flag radial +distortion in the enrolment scenario. We formalize the detection model as a +face image quality assessment (FIQA) algorithm and provide a careful inspection +of the effect of radial distortion on FRS performance. Evaluation results show +excellent detection results for the proposed models, and the study on the +impact on FRS uncovers valuable insights into how to best use these models in +operational systems. + +
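+ For reference, radial (fish-eye-like) distortion is commonly modelled with
+even-order polynomial terms in the radius from the image center; the sketch
+below applies such a model to normalized coordinates so one can see how points
+are displaced. The coefficients are made-up values for illustration, not
+estimates from the paper.
+```python
+import numpy as np
+
+def apply_radial_distortion(points, k1, k2):
+    """Distort normalized 2D points with the polynomial model
+    x_d = x * (1 + k1*r^2 + k2*r^4), where r is the radius from the center."""
+    r2 = np.sum(points**2, axis=1, keepdims=True)
+    factor = 1.0 + k1 * r2 + k2 * r2**2
+    return points * factor
+
+# Undistorted points on a grid (normalized coordinates, center at the origin).
+xs, ys = np.meshgrid(np.linspace(-0.5, 0.5, 5), np.linspace(-0.5, 0.5, 5))
+pts = np.stack([xs.ravel(), ys.ravel()], axis=1)
+
+# Negative k1 gives barrel (fish-eye-like) distortion.
+distorted = apply_radial_distortion(pts, k1=-0.4, k2=0.05)
+print(np.abs(distorted - pts).max())  # how far points moved
+```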
+
+
+
+
+ + ☆ BIOMEDICA: An Open Biomedical Image-Caption Archive, Dataset, and + Vision-Language Models Derived from Scientific Literature + + +
+ The development of vision-language models (VLMs) is driven by large-scale and +diverse multimodal datasets. However, progress toward generalist biomedical +VLMs is limited by the lack of annotated, publicly accessible datasets across +biology and medicine. Existing efforts are restricted to narrow domains, +missing the full diversity of biomedical knowledge encoded in scientific +literature. To address this gap, we introduce BIOMEDICA, a scalable, +open-source framework to extract, annotate, and serialize the entirety of the +PubMed Central Open Access subset into an easy-to-use, publicly accessible +dataset. Our framework produces a comprehensive archive with over 24 million +unique image-text pairs from over 6 million articles. Metadata and +expert-guided annotations are also provided. We demonstrate the utility and +accessibility of our resource by releasing BMCA-CLIP, a suite of CLIP-style +models continuously pre-trained on the BIOMEDICA dataset via streaming, +eliminating the need to download 27 TB of data locally. On average, our models +achieve state-of-the-art performance across 40 tasks - spanning pathology, +radiology, ophthalmology, dermatology, surgery, molecular biology, +parasitology, and cell biology - excelling in zero-shot classification with a +6.56% average improvement (as high as 29.8% and 17.5% in dermatology and +ophthalmology, respectively), and stronger image-text retrieval, all while +using 10x less compute. To foster reproducibility and collaboration, we release +our codebase and dataset for the broader research community. + 
+
+
+
+
+ + ☆ Adaptive Noise-Tolerant Network for Image Segmentation + + +
+ Unlike image classification and annotation, for which deep network models +have achieved clearly superior performance compared to traditional computer +vision algorithms, deep learning for automatic image segmentation still faces +critical challenges. One such hurdle is obtaining ground-truth segmentations as +the training labels for deep network training. Especially when we study +biomedical images, such as histopathological images (histo-images), it is +unrealistic to ask for manual segmentation labels as the ground truth for +training due to the fine image resolution as well as the large image size and +complexity. In this paper, instead of relying on clean segmentation labels, we +study whether and how integrating imperfect or noisy segmentation results from +off-the-shelf segmentation algorithms may help achieve better segmentation +results through a new Adaptive Noise-Tolerant Network (ANTN) model. We extend +noisy-label deep learning to image segmentation with two novel aspects: (1) +multiple noisy labels can be integrated into one deep learning model; (2) noisy +segmentation modeling, including probabilistic parameters, is adaptive, +depending on the given testing image appearance. Implementation of the new ANTN +model on both synthetic data and real-world histo-images demonstrates its +effectiveness and superiority over off-the-shelf and other existing +deep-learning-based image segmentation algorithms. + 
+
+
+
+
+ + ☆ Eye Sclera for Fair Face Image Quality Assessment + + +
+ Fair operational systems are crucial in gaining and maintaining society's +trust in face recognition systems (FRS). An FRS starts by capturing an image +and assessing its quality before using it further for enrollment or +verification. Fair Face Image Quality Assessment (FIQA) schemes therefore +become equally important in the context of fair FRS. This work examines the +sclera as a quality assessment region for obtaining a fair FIQA. The sclera +region is agnostic to demographic variations and skin colour when assessing the +quality of a face image. We analyze three skin-tone-related ISO/IEC face image +quality assessment measures and assess the sclera region as an alternative area +for assessing FIQ. Our analysis of a face dataset of individuals from +demographic groups representing different skin tones indicates that the sclera +region alone can serve as an alternative area for measuring the dynamic range +and the over- and under-exposure of a face. Being agnostic to skin tone, i.e., +demographic factors, the sclera region provides equal utility as a fair FIQA +measure, as shown by our Error-versus-Discard Characteristic (EDC) curve +analysis. + 
+
+
+
+
+ + ☆ Robust Single Object Tracking in LiDAR Point Clouds under Adverse + Weather Conditions + + +
+ 3D single object tracking (3DSOT) in LiDAR point clouds is a critical task +for outdoor perception, enabling real-time perception of object location, +orientation, and motion. Despite the impressive performance of current 3DSOT +methods, evaluating them on clean datasets inadequately reflects their +comprehensive performance, as the adverse weather conditions found in +real-world surroundings have not been considered. One of the main obstacles is +the lack of adverse weather benchmarks for the evaluation of 3DSOT. To this +end, this work proposes a challenging benchmark for LiDAR-based 3DSOT in +adverse weather, which comprises two synthetic datasets (KITTI-A and +nuScenes-A) and one real-world dataset (CADC-SOT) spanning three weather types: +rain, fog, and snow. Based on this benchmark, we evaluate the robustness of +five representative 3D trackers from different tracking frameworks and observe +significant performance degradation. This prompts the question: What are the +factors that cause current advanced methods to fail on such adverse weather +samples? Consequently, we explore the impacts of adverse weather and answer the +above question from three perspectives: 1) target distance; 2) template shape +corruption; and 3) target shape corruption. Finally, based on domain +randomization and contrastive learning, we design a dual-branch tracking +framework for adverse weather, named DRCT, which achieves excellent performance +on the proposed benchmark. + 
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ MSV-Mamba: A Multiscale Vision Mamba Network for Echocardiography + Segmentation + + +
+ Ultrasound imaging frequently encounters challenges, such as those related to +elevated noise levels, diminished spatiotemporal resolution, and the complexity +of anatomical structures. These factors significantly hinder the model's +ability to accurately capture and analyze structural relationships and dynamic +patterns across various regions of the heart. Mamba, an emerging model, is one +of the most cutting-edge approaches that is widely applied to diverse vision +and language tasks. To this end, this paper introduces a U-shaped deep learning +model incorporating a large-window Mamba scale (LMS) module and a hierarchical +feature fusion approach for echocardiographic segmentation. First, a cascaded +residual block serves as an encoder and is employed to incrementally extract +multiscale detailed features. Second, a large-window multiscale mamba module is +integrated into the decoder to capture global dependencies across regions and +enhance the segmentation capability for complex anatomical structures. +Furthermore, our model introduces auxiliary losses at each decoder layer and +employs a dual attention mechanism to fuse multilayer features both spatially +and across channels. This approach enhances segmentation performance and +accuracy in delineating complex anatomical structures. Finally, the +experimental results using the EchoNet-Dynamic and CAMUS datasets demonstrate +that the model outperforms other methods in terms of both accuracy and +robustness. For the segmentation of the left ventricular endocardium +(${LV}_{endo}$), the model achieved optimal values of 95.01 and 93.36, +respectively, while for the left ventricular epicardium (${LV}_{epi}$), values +of 87.35 and 87.80, respectively, were achieved. This represents an improvement +ranging between 0.54 and 1.11 compared with the best-performing model. + +
+
+
+
+
+ + ☆ Duplex: Dual Prototype Learning for Compositional Zero-Shot Learning + + +
+ Compositional Zero-Shot Learning (CZSL) aims to enable models to recognize +novel compositions of visual states and objects that were absent during +training. Existing methods predominantly focus on learning semantic +representations of seen compositions but often fail to disentangle the +independent features of states and objects in images, thereby limiting their +ability to generalize to unseen compositions. To address this challenge, we +propose Duplex, a novel dual-prototype learning method that integrates semantic +and visual prototypes through a carefully designed dual-branch architecture, +enabling effective representation learning for compositional tasks. Duplex +utilizes a Graph Neural Network (GNN) to adaptively update visual prototypes, +capturing complex interactions between states and objects. Additionally, it +leverages the strong visual-semantic alignment of pre-trained Vision-Language +Models (VLMs) and employs a multi-path architecture combined with prompt +engineering to align image and text representations, ensuring robust +generalization. Extensive experiments on three benchmark datasets demonstrate +that Duplex outperforms state-of-the-art methods in both closed-world and +open-world settings. + +
+
+
+
+
+ + ☆ Matching Free Depth Recovery from Structured Light + + +
+ We present a novel approach for depth estimation from images captured by +structured light systems. Unlike many previous methods that rely on image +matching process, our approach uses a density voxel grid to represent scene +geometry, which is trained via self-supervised differentiable volume rendering. +Our method leverages color fields derived from projected patterns in structured +light systems during the rendering process, enabling the isolated optimization +of the geometry field. This contributes to faster convergence and high-quality +output. Additionally, we incorporate normalized device coordinates (NDC), a +distortion loss, and a novel surface-based color loss to enhance geometric +fidelity. Experimental results demonstrate that our method outperforms existing +matching-based techniques in geometric performance for few-shot scenarios, +achieving approximately a 60% reduction in average estimated depth errors on +synthetic scenes and about 30% on real-world captured scenes. Furthermore, our +approach delivers fast training, with a speed roughly three times faster than +previous matching-free methods that employ implicit representations. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Dynamic Multimodal Fusion via Meta-Learning Towards Micro-Video + Recommendation + + +
+ Multimodal information (e.g., visual, acoustic, and textual) has been widely +used to enhance representation learning for micro-video recommendation. For +integrating multimodal information into a joint representation of micro-video, +multimodal fusion plays a vital role in the existing micro-video recommendation +approaches. However, the static multimodal fusion used in previous studies is +insufficient to model the various relationships among multimodal information of +different micro-videos. In this paper, we develop a novel meta-learning-based +multimodal fusion framework called Meta Multimodal Fusion (MetaMMF), which +dynamically assigns parameters to the multimodal fusion function for each +micro-video during its representation learning. Specifically, MetaMMF regards +the multimodal fusion of each micro-video as an independent task. Based on the +meta information extracted from the multimodal features of the input task, +MetaMMF parameterizes a neural network as the item-specific fusion function via +a meta learner. We perform extensive experiments on three benchmark datasets, +demonstrating the significant improvements over several state-of-the-art +multimodal recommendation models, like MMGCN, LATTICE, and InvRL. Furthermore, +we lighten our model by adopting canonical polyadic decomposition to improve +the training efficiency, and validate its effectiveness through experimental +results. Codes are available at https://github.com/hanliu95/MetaMMF. + +
+
+ comment: This paper has been accepted by ACM Transactions on Information + Systems +
+
+
+
+
+ + ☆ The Quest for Visual Understanding: A Journey Through the Evolution of + Visual Question Answering + + +
+ Visual Question Answering (VQA) is an interdisciplinary field that bridges +the gap between computer vision (CV) and natural language processing (NLP), +enabling Artificial Intelligence (AI) systems to answer questions about images. +Since its inception in 2015, VQA has rapidly evolved, driven by advances in +deep learning, attention mechanisms, and transformer-based models. This survey +traces the journey of VQA from its early days, through major breakthroughs, +such as attention mechanisms, compositional reasoning, and the rise of +vision-language pre-training methods. We highlight key models, datasets, and +techniques that shaped the development of VQA systems, emphasizing the pivotal +role of transformer architectures and multimodal pre-training in driving recent +progress. Additionally, we explore specialized applications of VQA in domains +like healthcare and discuss ongoing challenges, such as dataset bias, model +interpretability, and the need for common-sense reasoning. Lastly, we discuss +the emerging trends in large multimodal language models and the integration of +external knowledge, offering insights into the future directions of VQA. This +paper aims to provide a comprehensive overview of the evolution of VQA, +highlighting both its current state and potential advancements. + 
+
+
+
+
+ + ☆ RMAvatar: Photorealistic Human Avatar Reconstruction from Monocular + Video Based on Rectified Mesh-embedded Gaussians + + +
+ We introduce RMAvatar, a novel human avatar representation with Gaussian +splatting embedded on mesh to learn clothed avatars from a monocular video. We +utilize the explicit mesh geometry to represent motion and shape of a virtual +human and implicit appearance rendering with Gaussian Splatting. Our method +consists of two main modules: a Gaussian initialization module and a Gaussian +rectification module. We embed Gaussians into triangular faces and control +their motion through the mesh, which ensures low-frequency motion and surface +deformation of the avatar. Due to the limitations of the LBS formulation, it is +hard for the skeleton alone to control complex non-rigid transformations. We +therefore design a pose-related Gaussian rectification module to learn +fine-detailed non-rigid deformations, further improving the realism and +expressiveness of the avatar. We conduct extensive experiments on public +datasets; RMAvatar shows state-of-the-art performance in both rendering quality +and quantitative evaluations. Please see our project page at +https://rm-avatar.github.io. + 
+
+ comment: CVM2025 +
+
+
+
+
+ + ☆ Dual Scale-aware Adaptive Masked Knowledge Distillation for Object + Detection + + +
+ Recent feature masking knowledge distillation methods make use of attention +mechanisms to identify either important spatial regions or channel clues for +discriminative feature reconstruction. However, most existing strategies +perform global attention-guided feature masking distillation without delving +into fine-grained visual clues in feature maps. In particular, uncovering +locality-aware clues across different scales is conducive to reconstructing +region-aware features, thereby significantly benefiting distillation +performance. In this study, we propose a fine-grained adaptive feature masking +distillation framework for accurate object detection. Different from previous +methods in which global masking is performed on single-scale feature maps, we +explore scale-aware feature masking by performing feature distillation across +various scales, such that the object-aware locality is encoded for improved +feature reconstruction. In addition, our fine-grained feature distillation +strategy is combined with a masking logits distillation scheme in which the +logits difference between teacher and student networks is used to guide the +distillation process. Thus, it can help the student model to better learn from +the teacher counterpart with improved knowledge transfer. Extensive experiments +on the detection task demonstrate the superiority of our method. For example, +when RetinaNet, RepPoints and Cascade Mask RCNN are used as teacher detectors, +the student network achieves mAP scores of 41.5\%, 42.9\%, and 42.6\%, +respectively, outperforming state-of-the-art methods such as DMKD and FreeKD. + 
+
+
+
+
+ + ☆ Collaborative Learning for 3D Hand-Object Reconstruction and + Compositional Action Recognition from Egocentric RGB Videos Using + Superquadrics + + +
+ With the availability of egocentric 3D hand-object interaction datasets, +there is increasing interest in developing unified models for hand-object pose +estimation and action recognition. However, existing methods still struggle to +recognise seen actions on unseen objects due to the limitations in representing +object shape and movement using 3D bounding boxes. Additionally, the reliance +on object templates at test time limits their generalisability to unseen +objects. To address these challenges, we propose to leverage superquadrics as +an alternative 3D object representation to bounding boxes and demonstrate their +effectiveness on both template-free object reconstruction and action +recognition tasks. Moreover, as we find that pure appearance-based methods can +outperform the unified methods, the potential benefits from 3D geometric +information remain unclear. Therefore, we study the compositionality of actions +by considering a more challenging task where the training combinations of verbs +and nouns do not overlap with the testing split. We extend H2O and FPHA +datasets with compositional splits and design a novel collaborative learning +framework that can explicitly reason about the geometric relations between +hands and the manipulated object. Through extensive quantitative and +qualitative evaluations, we demonstrate significant improvements over the +state-of-the-arts in (compositional) action recognition. + +
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ☆ Video Quality Assessment for Online Processing: From Spatial to Temporal + Sampling + + +
+ With the rapid development of multimedia processing and deep learning +technologies, especially in the field of video understanding, video quality +assessment (VQA) has achieved significant progress. Although researchers have +moved from designing efficient video quality mapping models to various research +directions, in-depth exploration of the effectiveness-efficiency trade-offs of +spatio-temporal modeling in VQA models remains insufficient. Considering the +fact that videos contain highly redundant information, this paper investigates +this problem from the perspective of joint spatial and temporal sampling, +aiming to determine how little information we need to keep when feeding videos +into VQA models while incurring only an acceptable performance sacrifice. To +this end, we drastically sample the video's information from both spatial and +temporal dimensions, and the heavily squeezed video is then fed into a stable +VQA model. Comprehensive experiments regarding joint spatial and temporal +sampling are conducted on six public video quality databases, and the results +demonstrate the acceptable performance of the VQA model when throwing away most +of the video information. Furthermore, with the proposed joint spatial and +temporal sampling strategy, we make an initial attempt to design an online VQA +model, which is instantiated by a spatial feature extractor, a temporal feature +fusion module, and a global quality regression module that are all kept as +simple as possible. Through quantitative and qualitative experiments, we verify +the feasibility of the online VQA model by simplifying the model itself and +reducing its input. + 
+
+
+
+
+ + ☆ Representation Learning of Point Cloud Upsampling in Global and Local + Inputs + + +
+ In recent years, point cloud upsampling has been widely applied in fields +such as 3D reconstruction. Our study investigates the factors influencing point +cloud upsampling on both global and local levels through representation +learning. Specifically, we input the global and local information of the same +point cloud model object into two encoders to extract these features, fuse +them, and then feed the combined features into an upsampling decoder. The goal +is to address issues of sparsity and noise in point clouds by leveraging prior +knowledge from both global and local inputs. The proposed framework can be +applied to any state-of-the-art point cloud upsampling neural network. +Experiments were conducted on a series of autoencoder-based deep learning +models, yielding interpretability for both global and local inputs, and the +results show that our framework further improves the upsampling quality of +previous SOTA works. At the same time, saliency maps reflect the differences +between global and local feature inputs, as well as the effectiveness of +training with both inputs in parallel. + 
+
+
+
+
+ + ☆ Label Calibration in Source Free Domain Adaptation + + +
+ Source-free domain adaptation (SFDA) utilizes a pre-trained source model with +unlabeled target data. Self-supervised SFDA techniques generate pseudolabels +from the pre-trained source model, but these pseudolabels often contain noise +due to domain discrepancies between the source and target domains. Traditional +self-supervised SFDA techniques rely on deterministic model predictions using +the softmax function, leading to unreliable pseudolabels. In this work, we +propose to introduce predictive uncertainty and softmax calibration for +pseudolabel refinement using evidential deep learning. The Dirichlet prior is +placed over the output of the target network to capture uncertainty using +evidence with a single forward pass. Furthermore, softmax calibration solves +the translation invariance problem to assist in learning with noisy labels. We +incorporate a combination of evidential deep learning loss and information +maximization loss with calibrated softmax in both prior and non-prior target +knowledge SFDA settings. Extensive experimental analysis shows that our method +outperforms other state-of-the-art methods on benchmark datasets. + +
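+ The Dirichlet-over-outputs construction described above follows the standard
+evidential deep learning recipe: non-negative evidence defines Dirichlet
+concentration parameters, which give class probabilities and an uncertainty
+score in a single forward pass. A minimal sketch, assuming a generic PyTorch
+backbone and a ReLU evidence head rather than the paper's exact model:
+```python
+import torch
+import torch.nn as nn
+
+num_classes = 10
+# Generic backbone; the final layer produces per-class logits.
+backbone = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, num_classes))
+
+def dirichlet_from_logits(logits):
+    evidence = torch.relu(logits)          # non-negative evidence per class
+    alpha = evidence + 1.0                 # Dirichlet concentration parameters
+    strength = alpha.sum(dim=1, keepdim=True)
+    probs = alpha / strength               # expected class probabilities
+    uncertainty = num_classes / strength   # high when total evidence is low
+    return probs, uncertainty
+
+x = torch.randn(4, 128)
+probs, u = dirichlet_from_logits(backbone(x))
+print(probs.sum(dim=1), u.squeeze())       # probabilities sum to 1; u in (0, 1]
+```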
+
+ comment: Accepted in IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ Enhancing Image Generation Fidelity via Progressive Prompts + + +
+ The diffusion transformer (DiT) architecture has attracted significant +attention in image generation, achieving better fidelity, performance, and +diversity. However, most existing DiT-based image generation methods focus on +global-aware synthesis, and regional prompt control has been less explored. In +this paper, we propose a coarse-to-fine generation pipeline for regional +prompt-following generation. Specifically, we first utilize the powerful large +language model (LLM) to generate both high-level descriptions of the image +(such as content, topic, and objects) and low-level descriptions (such as +details and style). Then, we explore the influence of cross-attention layers at +different depths. We find that deeper layers are always responsible for +high-level content control, while shallow layers handle low-level content +control. Various prompts are injected into the proposed regional +cross-attention control for coarse-to-fine generation. By using the proposed +pipeline, we enhance the controllability of DiT-based image generation. +Extensive quantitative and qualitative results show that our pipeline can +improve the performance of the generated images. + 
+
+ comment: Accepted by ICASSP 2025, Github: + https://github.com/ZhenXiong-dl/ICASSP2025-RCAC +
+
+
+
+
+ + ☆ Hierarchical Superpixel Segmentation via Structural Information Theory + + +
+ Superpixel segmentation is a foundation for many higher-level computer vision +tasks, such as image segmentation, object recognition, and scene understanding. +Existing graph-based superpixel segmentation methods typically concentrate on +the relationships between a given pixel and its directly adjacent pixels while +overlooking the influence of non-adjacent pixels. These approaches do not fully +leverage the global information in the graph, leading to suboptimal +segmentation quality. To address this limitation, we present SIT-HSS, a +hierarchical superpixel segmentation method based on structural information +theory. Specifically, we first design a novel graph construction strategy that +incrementally explores the pixel neighborhood to add edges based on +1-dimensional structural entropy (1D SE). This strategy maximizes the retention +of graph information while avoiding an overly complex graph structure. Then, we +design a new 2D SE-guided hierarchical graph partitioning method, which +iteratively merges pixel clusters layer by layer to reduce the graph's 2D SE +until a predefined segmentation scale is achieved. Experimental results on +three benchmark datasets demonstrate that the SIT-HSS performs better than +state-of-the-art unsupervised superpixel segmentation algorithms. The source +code is available at \url{https://github.com/SELGroup/SIT-HSS}. + +
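+ The 1-dimensional structural entropy (1D SE) that guides edge addition has a
+simple closed form on an undirected graph: H_1(G) = -sum_i (d_i / 2m) log2(d_i
+/ 2m), where d_i is the (weighted) degree of node i and 2m the total degree. A
+minimal sketch, assuming NetworkX and written independently of the released
+SIT-HSS code:
+```python
+import math
+import networkx as nx
+
+def one_dim_structural_entropy(graph: nx.Graph) -> float:
+    """1D structural entropy: entropy of the stationary degree distribution."""
+    degrees = dict(graph.degree(weight="weight"))
+    total = sum(degrees.values())  # equals 2m for an undirected graph
+    return -sum((d / total) * math.log2(d / total)
+                for d in degrees.values() if d > 0)
+
+g = nx.karate_club_graph()
+print(round(one_dim_structural_entropy(g), 4))
+```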
+
+ comment: Accepted by SDM 2025 +
+
+
+
+
+ + ☆ SFC-GAN: A Generative Adversarial Network for Brain Functional and + Structural Connectome Translation + + +
+ Modern brain imaging technologies have enabled the detailed reconstruction of +human brain connectomes, capturing structural connectivity (SC) from diffusion +MRI and functional connectivity (FC) from functional MRI. Understanding the +intricate relationships between SC and FC is vital for gaining deeper insights +into the brain's functional and organizational mechanisms. However, obtaining +both SC and FC modalities simultaneously remains challenging, hindering +comprehensive analyses. Existing deep generative models typically focus on +synthesizing a single modality or unidirectional translation between FC and SC, +thereby missing the potential benefits of bi-directional translation, +especially in scenarios where only one connectome is available. Therefore, we +propose Structural-Functional Connectivity GAN (SFC-GAN), a novel framework for +bidirectional translation between SC and FC. This approach leverages the +CycleGAN architecture, incorporating convolutional layers to effectively +capture the spatial structures of brain connectomes. To preserve the +topological integrity of these connectomes, we employ a structure-preserving +loss that guides the model in capturing both global and local connectome +patterns while maintaining symmetry. Our framework demonstrates superior +performance in translating between SC and FC, outperforming baseline models in +similarity and graph property evaluations compared to ground truth data, and +each translated modality can be effectively utilized for downstream +classification. + 
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Protego: Detecting Adversarial Examples for Vision Transformers via + Intrinsic Capabilities + + +
+ Transformer models have excelled in natural language tasks, prompting the +vision community to explore their implementation in computer vision problems. +However, these models are still vulnerable to adversarial examples. In this +paper, we investigate the attack capabilities of six common adversarial attacks +on three pretrained ViT models to reveal the vulnerability of ViT models. To +understand and analyse the bias in neural network decisions when the input is +adversarial, we use two visualisation techniques, attention rollout and grad +attention rollout. To protect ViT models from adversarial attacks, we propose +Protego, a detection framework that leverages the transformer's intrinsic +capabilities to detect adversarial examples against ViT models. Nonetheless, +this is challenging due to the diversity of attack strategies that may be +adopted by adversaries. Inspired by the attention mechanism, we observe that +the prediction token aggregates information from the entire input sample. +Additionally, the attention region for adversarial examples differs from that +of normal examples. Given these points, we can train a detector that +outperforms existing detection methods at identifying adversarial examples. Our +experiments have demonstrated the high effectiveness of our detection method. +For these six adversarial attack methods, our detector's AUC scores all exceed +0.95. Protego may advance investigations in metaverse security. + 
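+ Attention rollout, one of the two visualisation techniques named above,
+aggregates attention across layers by averaging heads, adding the residual
+(identity) connection, renormalising, and multiplying the layer matrices
+together. A minimal NumPy sketch of that procedure, with random row-stochastic
+matrices standing in for a real ViT's attention maps:
+```python
+import numpy as np
+
+def attention_rollout(attentions):
+    """attentions: list of (heads, tokens, tokens) matrices, one per layer."""
+    tokens = attentions[0].shape[-1]
+    rollout = np.eye(tokens)
+    for layer_attn in attentions:
+        attn = layer_attn.mean(axis=0)                 # average over heads
+        attn = 0.5 * attn + 0.5 * np.eye(tokens)       # add residual connection
+        attn = attn / attn.sum(axis=-1, keepdims=True) # renormalise rows
+        rollout = attn @ rollout                       # accumulate across layers
+    return rollout
+
+rng = np.random.default_rng(0)
+layers = [rng.random((8, 197, 197)) for _ in range(12)]       # 12 layers, 8 heads
+layers = [a / a.sum(axis=-1, keepdims=True) for a in layers]  # row-stochastic
+cls_to_patches = attention_rollout(layers)[0, 1:]             # CLS token vs. patches
+print(cls_to_patches.shape)  # (196,)
+```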
+
+ comment: Accepted by IEEE MetaCom 2024 +
+
+
+
+
+ + ☆ Rethinking Knowledge in Distillation: An In-context Sample Retrieval + Perspective + + +
+ Conventional knowledge distillation (KD) approaches are designed for the +student model to predict similar output to the teacher model for each sample. +Unfortunately, the relationship across samples with the same class is often +neglected. In this paper, we redefine the knowledge in distillation, capturing +the relationship between each sample and its corresponding in-context samples +(a group of similar samples with the same or different classes), and perform KD +from an in-context sample retrieval perspective. As KD is a type of learned +label smoothing regularization (LSR), we first conduct a theoretical analysis +showing that the teacher's knowledge from the in-context samples is a crucial +contributor to regularizing the student training with the corresponding +samples. Buttressed by the analysis, we propose a novel in-context knowledge +distillation (IC-KD) framework that shows its superiority across diverse KD +paradigms (offline, online, and teacher-free KD). Firstly, we construct a +feature memory bank from the teacher model and retrieve in-context samples for +each corresponding sample through retrieval-based learning. We then introduce +Positive In-Context Distillation (PICD) to reduce the discrepancy between a +sample from the student and the aggregated in-context samples with the same +class from the teacher in the logit space. Moreover, Negative In-Context +Distillation (NICD) is introduced to separate a sample from the student and the +in-context samples with different classes from the teacher in the logit space. +Extensive experiments demonstrate that IC-KD is effective across various types +of KD, and consistently achieves state-of-the-art performance on the CIFAR-100 +and ImageNet datasets. + 
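+ For context, the "learned label smoothing regularization" view refers to the
+standard KD objective in which the student matches temperature-softened
+teacher outputs; IC-KD builds its positive/negative in-context terms on top of
+this. A sketch of the vanilla KD loss only (PyTorch; not the paper's IC-KD
+implementation):
+```python
+import torch
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, targets, T=4.0, alpha=0.5):
+    """Standard KD: cross-entropy on labels + T^2-scaled KL to the teacher."""
+    ce = F.cross_entropy(student_logits, targets)
+    kl = F.kl_div(
+        F.log_softmax(student_logits / T, dim=1),
+        F.softmax(teacher_logits / T, dim=1),
+        reduction="batchmean",
+    ) * (T * T)
+    return alpha * ce + (1.0 - alpha) * kl
+
+s = torch.randn(8, 100, requires_grad=True)   # student logits
+t = torch.randn(8, 100)                       # teacher logits
+y = torch.randint(0, 100, (8,))               # ground-truth labels
+print(kd_loss(s, t, y).item())
+```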
+
+
+
+
+ + ☆ IoT-Based Real-Time Medical-Related Human Activity Recognition Using + Skeletons and Multi-Stage Deep Learning for Healthcare + + +
+ The Internet of Things (IoT) and mobile technology have significantly +transformed healthcare by enabling real-time monitoring and diagnosis of +patients. Recognizing medical-related human activities (MRHA) is pivotal for +healthcare systems, particularly for identifying actions that are critical to +patient well-being. However, challenges such as high computational demands, low +accuracy, and limited adaptability persist in Human Motion Recognition (HMR). +While some studies have integrated HMR with IoT for real-time healthcare +applications, limited research has focused on recognizing MRHA as essential for +effective patient monitoring. This study proposes a novel HMR method for MRHA +detection, leveraging multi-stage deep learning techniques integrated with IoT. +The approach employs EfficientNet to extract optimized spatial features from +skeleton frame sequences using seven Mobile Inverted Bottleneck Convolutions +(MBConv) blocks, followed by ConvLSTM to capture spatio-temporal patterns. A +classification module with global average pooling, a fully connected layer, and +a dropout layer generates the final predictions. The model is evaluated on the +NTU RGB+D 120 and HMDB51 datasets, focusing on MRHA, such as sneezing, falling, +walking, sitting, etc. It achieves 94.85% accuracy for cross-subject +evaluations and 96.45% for cross-view evaluations on NTU RGB+D 120, along with +89.00% accuracy on HMDB51. Additionally, the system integrates IoT capabilities +using a Raspberry Pi and GSM module, delivering real-time alerts via Twilio's +SMS service to caregivers and patients. This scalable and efficient solution +bridges the gap between HMR and IoT, advancing patient monitoring, improving +healthcare outcomes, and reducing costs. + 
+
+
+
+
+ + ☆ Detection of AI Deepfake and Fraud in Online Payments Using GAN-Based + Models + + +
+ This study explores the use of Generative Adversarial Networks (GANs) to +detect AI deepfakes and fraudulent activities in online payment systems. With +the growing prevalence of deepfake technology, which can manipulate facial +features in images and videos, the potential for fraud in online transactions +has escalated. Traditional security systems struggle to identify these +sophisticated forms of fraud. This research proposes a novel GAN-based model +that enhances online payment security by identifying subtle manipulations in +payment images. The model is trained on a dataset consisting of real-world +online payment images and deepfake images generated using advanced GAN +architectures, such as StyleGAN and DeepFake. The results demonstrate that the +proposed model can accurately distinguish between legitimate transactions and +deepfakes, achieving a high detection rate above 95%. This approach +significantly improves the robustness of payment systems against AI-driven +fraud. The paper contributes to the growing field of digital security, offering +insights into the application of GANs for fraud detection in financial +services. Keywords- Payment Security, Image Recognition, Generative Adversarial +Networks, AI Deepfake, Fraudulent Activities + +
+
+ comment: The paper will be published and indexed by IEEE at 2025 8th + International Conference on Advanced Algorithms and Control Engineering + (ICAACE 2025) +
+
+
+
+
+ + ☆ UNetVL: Enhancing 3D Medical Image Segmentation with Chebyshev KAN + Powered Vision-LSTM + + +
+ 3D medical image segmentation has progressed considerably due to +Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs), yet these +methods struggle to balance long-range dependency acquisition with +computational efficiency. To address this challenge, we propose UNETVL (U-Net +Vision-LSTM), a novel architecture that leverages recent advancements in +temporal information processing. UNETVL incorporates Vision-LSTM (ViL) for +improved scalability and memory functions, alongside an efficient Chebyshev +Kolmogorov-Arnold Networks (KAN) to handle complex and long-range dependency +patterns more effectively. We validated our method on the ACDC and AMOS2022 +(post challenge Task 2) benchmark datasets, showing a significant improvement +in mean Dice score compared to recent state-of-the-art approaches, especially +over its predecessor, UNETR, with increases of 7.3% on ACDC and 15.6% on AMOS, +respectively. Extensive ablation studies were conducted to demonstrate the +impact of each component in UNETVL, providing a comprehensive understanding of +its architecture. Our code is available at https://github.com/tgrex6/UNETVL, +facilitating further research and applications in this domain. + +
+
+
+
+
+ + ☆ A Multi-Modal Deep Learning Framework for Pan-Cancer Prognosis + + +
+ The prognostic task is of great importance as it is closely related to the +survival analysis of patients, the optimization of treatment plans and the +allocation of resources. Existing prognostic models have shown promising +results on specific datasets, but there are limitations in two aspects. On the +one hand, they merely explore certain types of modal data, such as patient +histopathology WSI and gene expression analysis. On the other hand, they adopt +the per-cancer-per-model paradigm, which means the trained models can only +predict the prognostic effect of a single type of cancer, resulting in weak +generalization ability. In this paper, a deep-learning based model, named +UMPSNet, is proposed. Specifically, to comprehensively understand the condition +of patients, in addition to constructing encoders for histopathology images and +genomic expression profiles respectively, UMPSNet further integrates four types +of important meta data (demographic information, cancer type information, +treatment protocols, and diagnosis results) into text templates, and then +introduces a text encoder to extract textual features. In addition, an optimal +transport (OT)-based attention mechanism is utilized to align and fuse features +of different modalities. Furthermore, a guided soft mixture of experts (GMoE) +mechanism is introduced to effectively address the issue of distribution +differences among multiple cancer datasets. By incorporating the multi-modality +of patient data and joint training, UMPSNet outperforms all SOTA approaches, +and moreover, it demonstrates the effectiveness and generalization ability of +the proposed learning paradigm of a single model for multiple cancer types. The +code of UMPSNet is available at https://github.com/binging512/UMPSNet. + 
+
+
+
+
+ + ☆ SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting + + +
+ Achieving high-fidelity 3D reconstruction from monocular video remains +challenging due to the inherent limitations of traditional methods like +Structure-from-Motion (SfM) and monocular SLAM in accurately capturing scene +details. While differentiable rendering techniques such as Neural Radiance +Fields (NeRF) address some of these challenges, their high computational costs +make them unsuitable for real-time applications. Additionally, existing 3D +Gaussian Splatting (3DGS) methods often focus on photometric consistency, +neglecting geometric accuracy and failing to exploit SLAM's dynamic depth and +pose updates for scene refinement. We propose a framework integrating dense +SLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach +introduces SLAM-Informed Adaptive Densification, which dynamically updates and +densifies the Gaussian model by leveraging dense point clouds from SLAM. +Additionally, we incorporate Geometry-Guided Optimization, which combines +edge-aware geometric constraints and photometric consistency to jointly +optimize the appearance and geometry of the 3DGS scene representation, enabling +detailed and accurate SLAM mapping reconstruction. Experiments on the Replica +and TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving +state-of-the-art results among monocular systems. Specifically, our method +achieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica, +representing improvements of 10.7%, 6.4%, and 49.4%, respectively, over the +previous SOTA. On TUM-RGBD, our method outperforms the closest baseline by +10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the +potential of our framework in bridging the gap between photometric and +geometric dense 3D scene representations, paving the way for practical and +efficient monocular dense reconstruction. + +
+
+
+
+
+ + ☆ LEO: Boosting Mixture of Vision Encoders for Multimodal Large Language + Models + + +
+ Enhanced visual understanding serves as a cornerstone for multimodal large +language models (MLLMs). Recent hybrid MLLMs incorporate a mixture of vision +experts to address the limitations of using a single vision encoder and +excessively long visual tokens. Despite the progress of these MLLMs, a research +gap remains in effectively integrating diverse vision encoders. This work +explores fusion strategies of visual tokens for hybrid MLLMs, leading to the +design of LEO, a novel MLLM with a dual-branch vision encoder framework that +incorporates a post-adaptation fusion strategy and adaptive tiling: for each +segmented tile of the input images, LEO sequentially interleaves the visual +tokens from its two vision encoders. Extensive evaluation across 13 +vision-language benchmarks reveals that LEO outperforms state-of-the-art +open-source MLLMs and hybrid MLLMs on the majority of tasks. Furthermore, we +show that LEO can be adapted to the specialized domain of autonomous driving +without altering the model architecture or training recipe, achieving +competitive performance compared to existing baselines. The code and model will +be publicly available. + +
+
+
+
+
+ + ☆ Universal Training of Neural Networks to Achieve Bayes Optimal + Classification Accuracy + + +
+ This work invokes the notion of $f$-divergence to introduce a novel upper +bound on the Bayes error rate of a general classification task. We show that +the proposed bound can be computed by sampling from the output of a +parameterized model. Using this practical interpretation, we introduce the +Bayes optimal learning threshold (BOLT) loss whose minimization enforces a +classification model to achieve the Bayes error rate. We validate the proposed +loss for image and text classification tasks, considering MNIST, Fashion-MNIST, +CIFAR-10, and IMDb datasets. Numerical experiments demonstrate that models +trained with BOLT achieve performance on par with or exceeding that of +cross-entropy, particularly on challenging datasets. This highlights the +potential of BOLT in improving generalization. + +
+
+ comment: Accepted to ICASSP 2025 +
+
+
+
+
+ + ☆ Boosting Sclera Segmentation through Semi-supervised Learning with Fewer + Labels + + +
+ Sclera segmentation is crucial for developing automatic eye-related medical +computer-aided diagnostic systems, as well as for personal identification and +verification, because the sclera contains distinct personal features. Deep +learning-based sclera segmentation has achieved significant success compared to +traditional methods that rely on hand-crafted features, primarily because it +can autonomously extract critical output-related features without the need to +consider potential physical constraints. However, achieving accurate sclera +segmentation using these methods is challenging due to the scarcity of +high-quality, fully labeled datasets, which depend on costly, labor-intensive +medical acquisition and expertise. To address this challenge, this paper +introduces a novel sclera segmentation framework that excels with limited +labeled samples. Specifically, we employ a semi-supervised learning method that +integrates domain-specific improvements and image-based spatial transformations +to enhance segmentation performance. Additionally, we have developed a +real-world eye diagnosis dataset to enrich the evaluation process. Extensive +experiments on our dataset and two additional public datasets demonstrate the +effectiveness and superiority of our proposed method, especially with +significantly fewer labeled samples. + +
+
+ comment: Under review, 19 pages, 9 figures, 4 tables +
+
+
+
+
+ + ☆ A Heterogeneous Multimodal Graph Learning Framework for Recognizing User + Emotions in Social Networks + + +
+ The rapid expansion of social media platforms has provided unprecedented +access to massive amounts of multimodal user-generated content. Comprehending +user emotions can provide valuable insights for improving communication and +understanding of human behaviors. Despite significant advancements in Affective +Computing, the diverse factors influencing user emotions in social networks +remain relatively understudied. Moreover, there is a notable lack of deep +learning-based methods for predicting user emotions in social networks, which +could be addressed by leveraging the extensive multimodal data available. This +work presents a novel formulation of personalized emotion prediction in social +networks based on heterogeneous graph learning. Building upon this formulation, +we design HMG-Emo, a Heterogeneous Multimodal Graph Learning Framework that +utilizes deep learning-based features for user emotion recognition. +Additionally, we include a dynamic context fusion module in HMG-Emo that is +capable of adaptively integrating the different modalities in social media +data. Through extensive experiments, we demonstrate the effectiveness of +HMG-Emo and verify the superiority of adopting a graph neural network-based +approach, which outperforms existing baselines that use rich hand-crafted +features. To the best of our knowledge, HMG-Emo is the first multimodal and +deep-learning-based approach to predict personalized emotions within online +social networks. Our work highlights the significance of exploiting advanced +deep learning techniques for less-explored problems in Affective Computing. + +
+
+
+
+
+ + ☆ Fixing the Scale and Shift in Monocular Depth For Camera Pose Estimation + + +
+ Recent advances in monocular depth prediction have led to significantly +improved depth prediction accuracy. In turn, this enables various applications +to use such depth predictions. In this paper, we propose a novel framework for +estimating the relative pose between two cameras from point correspondences +with associated monocular depths. Since depth predictions are typically defined +up to an unknown scale and shift parameter, our solvers jointly estimate both +scale and shift parameters together with the camera pose. We derive efficient +solvers for three cases: (1) two calibrated cameras, (2) two uncalibrated +cameras with an unknown but shared focal length, and (3) two uncalibrated +cameras with unknown and different focal lengths. Experiments on synthetic and +real data, including experiments with depth maps estimated by 11 different +depth predictors, show the practical viability of our solvers. Compared to +prior work, our solvers achieve state-of-the-art results on two large-scale, +real-world datasets. The source code is available at +https://github.com/yaqding/pose_monodepth + +
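+ The paper's minimal solvers are not reproduced here; as a simplified
+illustration of the underlying geometry, the sketch below backprojects
+correspondences using depths assumed to be already corrected for scale and
+shift, then recovers the relative pose with the classical Kabsch alignment.
+This is a generic baseline under that assumption, not the proposed solvers,
+which estimate scale and shift jointly with the pose.
+```python
+import numpy as np
+
+def backproject(uv, depth, K):
+    """Pixels (N,2) + depths (N,) + intrinsics K -> 3D points in camera frame."""
+    ones = np.ones((uv.shape[0], 1))
+    rays = (np.linalg.inv(K) @ np.hstack([uv, ones]).T).T
+    return rays * depth[:, None]
+
+def rigid_align(P, Q):
+    """Kabsch: rotation R and translation t such that R @ p + t ~= q."""
+    cp, cq = P.mean(axis=0), Q.mean(axis=0)
+    H = (P - cp).T @ (Q - cq)
+    U, _, Vt = np.linalg.svd(H)
+    D = np.diag([1.0, 1.0, np.sign(np.linalg.det(Vt.T @ U.T))])
+    R = Vt.T @ D @ U.T
+    return R, cq - R @ cp
+
+# Synthetic check: random points seen by two cameras related by a known pose.
+rng = np.random.default_rng(1)
+K = np.array([[500.0, 0, 320], [0, 500.0, 240], [0, 0, 1]])
+P1 = rng.uniform([-1, -1, 4], [1, 1, 8], size=(50, 3))
+a = 0.3
+R_gt = np.array([[np.cos(a), 0, np.sin(a)], [0, 1, 0], [-np.sin(a), 0, np.cos(a)]])
+t_gt = np.array([0.2, -0.1, 0.5])
+P2 = P1 @ R_gt.T + t_gt
+
+uv1 = P1 @ K.T; uv1 = uv1[:, :2] / uv1[:, 2:3]
+uv2 = P2 @ K.T; uv2 = uv2[:, :2] / uv2[:, 2:3]
+X1 = backproject(uv1, P1[:, 2], K)   # depths assumed scale/shift-corrected
+X2 = backproject(uv2, P2[:, 2], K)
+R, t = rigid_align(X1, X2)
+print(np.allclose(R, R_gt, atol=1e-6), np.allclose(t, t_gt, atol=1e-6))
+```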
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Democratizing Text-to-Image Masked Generative Models with Compact + Text-Aware One-Dimensional Tokens + + +
+ Image tokenizers form the foundation of modern text-to-image generative +models but are notoriously difficult to train. Furthermore, most existing +text-to-image models rely on large-scale, high-quality private datasets, making +them challenging to replicate. In this work, we introduce Text-Aware +Transformer-based 1-Dimensional Tokenizer (TA-TiTok), an efficient and powerful +image tokenizer that can utilize either discrete or continuous 1-dimensional +tokens. TA-TiTok uniquely integrates textual information during the tokenizer +decoding stage (i.e., de-tokenization), accelerating convergence and enhancing +performance. TA-TiTok also benefits from a simplified, yet effective, one-stage +training process, eliminating the need for the complex two-stage distillation +used in previous 1-dimensional tokenizers. This design allows for seamless +scalability to large datasets. Building on this, we introduce a family of +text-to-image Masked Generative Models (MaskGen), trained exclusively on open +data while achieving comparable performance to models trained on private data. +We aim to release both the efficient, strong TA-TiTok tokenizers and the +open-data, open-weight MaskGen models to promote broader access and democratize +the field of text-to-image masked generative models. + +
+
+ comment: Project page at https://tacju.github.io/projects/maskgen.html +
+
+
+
+
+ + ☆ Testing Human-Hand Segmentation on In-Distribution and + Out-of-Distribution Data in Human-Robot Interactions Using a Deep Ensemble + Model + + +
+ Reliable detection and segmentation of human hands are critical for enhancing +safety and facilitating advanced interactions in human-robot collaboration. +Current research predominantly evaluates hand segmentation under +in-distribution (ID) data, which reflects the training data of deep learning +(DL) models. However, this approach fails to address out-of-distribution (OOD) +scenarios that often arise in real-world human-robot interactions. In this +study, we present a novel approach by evaluating the performance of pre-trained +DL models under both ID data and more challenging OOD scenarios. To mimic +realistic industrial scenarios, we designed a diverse dataset featuring simple +and cluttered backgrounds with industrial tools, varying numbers of hands (0 to +4), and hands with and without gloves. For OOD scenarios, we incorporated +unique and rare conditions such as finger-crossing gestures and motion blur +from fast-moving hands, addressing both epistemic and aleatoric uncertainties. +To cover multiple points of view (PoVs), we utilized both egocentric cameras, +mounted on the operator's head, and static cameras to capture RGB images of +human-robot interactions. This approach allowed us to account for multiple +camera perspectives while also evaluating the performance of models trained on +existing egocentric datasets as well as static-camera datasets. For +segmentation, we used a deep ensemble model composed of UNet and RefineNet as +base learners. Performance evaluation was conducted using segmentation metrics +and uncertainty quantification via predictive entropy. Results revealed that +models trained on industrial datasets outperformed those trained on +non-industrial datasets, highlighting the importance of context-specific +training. Although all models struggled with OOD scenarios, those trained on +industrial datasets demonstrated significantly better generalization. + 
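+ Predictive entropy for a deep ensemble, as used here for uncertainty
+quantification, is simply the entropy of the ensemble-averaged per-pixel class
+distribution. A short NumPy sketch; the two-member ensemble and tensor shapes
+are illustrative, not the UNet/RefineNet models themselves:
+```python
+import numpy as np
+
+def predictive_entropy(member_probs):
+    """member_probs: (n_members, n_pixels, n_classes) softmax outputs.
+    Returns the per-pixel entropy of the ensemble-averaged distribution."""
+    mean_probs = member_probs.mean(axis=0)
+    return -np.sum(mean_probs * np.log(mean_probs + 1e-12), axis=-1)
+
+rng = np.random.default_rng(0)
+# Two ensemble members, 4 pixels, 2 classes (hand / background).
+probs = rng.dirichlet([1.0, 1.0], size=(2, 4))
+print(predictive_entropy(probs))  # high values flag pixels the ensemble disagrees on
+```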
+
+
+
+
+ + ☆ Pedestrian Trajectory Prediction Based on Social Interactions Learning + With Random Weights + + +
+ Pedestrian trajectory prediction is a critical technology in the evolution of +self-driving cars toward complete artificial intelligence. In recent years, +modeling pedestrians' social interactions from their trajectories has attracted +great interest as a route to more accurate trajectory predictions. However, +existing methods for modeling pedestrian social interactions rely on +pre-defined rules, struggling to capture non-explicit social interactions. In +this work, we propose a novel framework named DTGAN, which extends the +application of Generative Adversarial Networks (GANs) to graph sequence data, +with the primary objective of automatically capturing implicit social +interactions and achieving precise predictions of pedestrian trajectory. DTGAN +innovatively incorporates random weights within each graph to eliminate the +need for pre-defined interaction rules. We further enhance the performance of +DTGAN by exploring diverse task loss functions during adversarial training, +which yields improvements of 16.7\% and 39.3\% on the metrics ADE and FDE, +respectively. The effectiveness and accuracy of our framework are verified on +two public datasets. The experimental results show that our proposed DTGAN +achieves superior performance and captures pedestrians' intentions well. + 
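+ The ADE and FDE metrics on which the reported 16.7% and 39.3% gains are
+measured are straightforward: average displacement error over all predicted
+timesteps, and displacement error at the final timestep. A minimal NumPy
+sketch with illustrative shapes:
+```python
+import numpy as np
+
+def ade_fde(pred, gt):
+    """pred, gt: (n_pedestrians, timesteps, 2) trajectories in world coordinates."""
+    dists = np.linalg.norm(pred - gt, axis=-1)   # (n_pedestrians, timesteps)
+    ade = dists.mean()                           # average over peds and timesteps
+    fde = dists[:, -1].mean()                    # error at the final timestep
+    return ade, fde
+
+rng = np.random.default_rng(0)
+gt = rng.normal(size=(5, 12, 2))                 # 5 pedestrians, 12 future steps
+pred = gt + rng.normal(scale=0.1, size=gt.shape)
+print(ade_fde(pred, gt))
+```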
+
+ comment: 13 pages,7 figures,Accepted to IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ☆ C2PD: Continuity-Constrained Pixelwise Deformation for Guided Depth + Super-Resolution + + +
+ Guided depth super-resolution (GDSR) has demonstrated impressive performance +across a wide range of domains, with numerous methods being proposed. However, +existing methods often treat depth maps as images, where shading values are +computed discretely, making them struggle to effectively restore the continuity +inherent in the depth map. In this paper, we propose a novel approach that +maximizes the utilization of spatial characteristics in depth, coupled with +human abstract perception of real-world substance, by transforming the GDSR +issue into deformation of a roughcast with ideal plasticity, which can be +deformed by force like a continuous object. Specifically, we firstly designed a +cross-modal operation, Continuity-constrained Asymmetrical Pixelwise Operation +(CAPO), which can mimic the process of deforming an isovolumetrically flexible +object through external forces. Utilizing CAPO as the fundamental component, we +develop the Pixelwise Cross Gradient Deformation (PCGD), which is capable of +emulating operations on ideal plastic objects (without volume constraint). +Notably, our approach demonstrates state-of-the-art performance across four +widely adopted benchmarks for GDSR, with significant advantages in large-scale +tasks and generalizability. + +
+
+
+
+
+ + ☆ Dataset Distillation as Pushforward Optimal Quantization + + +
+ Dataset distillation aims to find a synthetic training set such that training +on the synthetic data achieves similar performance to training on real data, +with orders of magnitude less computational requirements. Existing methods can +be broadly categorized as either bi-level optimization problems that have +neural network training heuristics as the lower level problem, or disentangled +methods that bypass the bi-level optimization by matching distributions of +data. The latter method has the major advantages of speed and scalability in +terms of size of both training and distilled datasets. We demonstrate that when +equipped with an encoder-decoder structure, the empirically successful +disentangled methods can be reformulated as an optimal quantization problem, +where a finite set of points is found to approximate the underlying probability +measure by minimizing the expected projection distance. In particular, we link +existing disentangled dataset distillation methods to the classical optimal +quantization and Wasserstein barycenter problems, demonstrating consistency of +distilled datasets for diffusion-based generative priors. We propose a simple +extension of the state-of-the-art data distillation method D4M, achieving +better performance on the ImageNet-1K dataset with trivial additional +computation, and state-of-the-art performance in higher image-per-class +settings. + +
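+ The optimal-quantization view means choosing a finite set of points that
+minimizes the expected squared projection distance to the underlying
+distribution, which is exactly what Lloyd's algorithm (k-means) approximates
+empirically. A toy NumPy sketch of that connection, standing in for the actual
+distillation pipeline:
+```python
+import numpy as np
+
+def lloyd(data, k, iters=50, seed=0):
+    """Lloyd's algorithm: k points minimizing mean squared projection distance."""
+    rng = np.random.default_rng(seed)
+    centers = data[rng.choice(len(data), size=k, replace=False)]
+    for _ in range(iters):
+        # Assign each sample to its nearest center (the "projection").
+        d = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=-1)
+        assign = d.argmin(axis=1)
+        # Move each center to the mean of its assigned samples.
+        for j in range(k):
+            if np.any(assign == j):
+                centers[j] = data[assign == j].mean(axis=0)
+    d = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=-1)
+    distortion = (d.min(axis=1) ** 2).mean()
+    return centers, distortion
+
+data = np.random.default_rng(1).normal(size=(1000, 16))  # stand-in for latent features
+centers, distortion = lloyd(data, k=10)
+print(centers.shape, round(distortion, 3))
+```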
+
+
+
+
+ + ☆ BlobGEN-Vid: Compositional Text-to-Video Generation with Blob Video + Representations + + +
+ Existing video generation models struggle to follow complex text prompts and +synthesize multiple objects, raising the need for additional grounding input +for improved controllability. In this work, we propose to decompose videos into +visual primitives - blob video representation, a general representation for +controllable video generation. Based on blob conditions, we develop a +blob-grounded video diffusion model named BlobGEN-Vid that allows users to +control object motions and fine-grained object appearance. In particular, we +introduce a masked 3D attention module that effectively improves regional +consistency across frames. In addition, we introduce a learnable module to +interpolate text embeddings so that users can control semantics in specific +frames and obtain smooth object transitions. We show that our framework is +model-agnostic and build BlobGEN-Vid based on both U-Net and DiT-based video +diffusion models. Extensive experimental results show that BlobGEN-Vid achieves +superior zero-shot video generation ability and state-of-the-art layout +controllability on multiple benchmarks. When combined with an LLM for layout +planning, our framework even outperforms proprietary text-to-video generators +in terms of compositional accuracy. + +
+
+ comment: Project page: https://blobgen-vid2.github.io/ +
+
+
+
+
+ + ♻ ☆ The Sound of Water: Inferring Physical Properties from Pouring Liquids + + +
+ We study the connection between audio-visual observations and the underlying +physics of a mundane yet intriguing everyday activity: pouring liquids. Given +only the sound of liquid pouring into a container, our objective is to +automatically infer physical properties such as the liquid level, the shape and +size of the container, the pouring rate and the time to fill. To this end, we: +(i) show in theory that these properties can be determined from the fundamental +frequency (pitch); (ii) train a pitch detection model with supervision from +simulated data and visual data with a physics-inspired objective; (iii) +introduce a new large dataset of real pouring videos for a systematic study; +(iv) show that the trained model can indeed infer these physical properties for +real data; and finally, (v) we demonstrate strong generalization to various +container shapes, other datasets, and in-the-wild YouTube videos. Our work +presents a keen understanding of a narrow yet rich problem at the intersection +of acoustics, physics, and learning. It opens up applications to enhance +multisensory perception in robotic pouring. + +
+
+ comment: Project page at https://bpiyush.github.io/pouring-water-website. + Short version accepted to ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ Robot Synesthesia: A Sound and Emotion Guided AI Painter + + +
+ If a picture paints a thousand words, sound may voice a million. While recent +robotic painting and image synthesis methods have achieved progress in +generating visuals from text inputs, the translation of sound into images is +vastly unexplored. Generally, sound-based interfaces and sonic interactions +have the potential to expand accessibility and control for the user and provide +a means to convey complex emotions and the dynamic aspects of the real world. +In this paper, we propose an approach for using sound and speech to guide a +robotic painting process, known here as robot synesthesia. For general sound, +we encode the simulated paintings and input sounds into the same latent space. +For speech, we decouple speech into its transcribed text and the tone of the +speech. Whereas we use the text to control the content, we estimate the +emotions from the tone to guide the mood of the painting. Our approach has been +fully integrated with FRIDA, a robotic painting framework, adding sound and +speech to FRIDA's existing input modalities, such as text and style. In two +surveys, participants were able to correctly guess the emotion or natural sound +used to generate a given painting more than twice as likely as random chance. +On our sound-guided image manipulation and music-guided paintings, we discuss +the results qualitatively. + +
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Quilt-1M: One Million Image-Text Pairs for Histopathology + + +
+ Recent accelerations in multi-modal applications have been made possible with
+the plethora of image and text data available online. However, the scarcity of
+analogous data in the medical field, specifically in histopathology, has slowed
+comparable progress. To enable similar representation learning for
+histopathology, we turn to YouTube, an untapped resource of videos, offering
+$1,087$ hours of valuable educational histopathology videos from expert
+clinicians. From YouTube, we curate QUILT: a large-scale vision-language
+dataset consisting of $802,144$ image and text pairs. QUILT was automatically
+curated using a mixture of models, including large language models, handcrafted
+algorithms, human knowledge databases, and automatic speech recognition. In
+comparison, the most comprehensive datasets curated for histopathology amass
+only around $200$K samples. We combine QUILT with datasets from other sources,
+including Twitter, research papers, and the internet in general, to create an
+even larger dataset: QUILT-1M, with $1$M paired image-text samples, marking it
+as the largest vision-language histopathology dataset to date. We demonstrate
+the value of QUILT-1M by fine-tuning a pre-trained CLIP model. Our model
+outperforms state-of-the-art models on both zero-shot and linear probing tasks
+for classifying new histopathology images across $13$ diverse patch-level
+datasets of $8$ different sub-pathologies and cross-modal retrieval tasks.
+
+
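+ Linear probing, one of the evaluation protocols mentioned above, fits a linear
+classifier on frozen image embeddings. The sketch below uses scikit-learn on
+random stand-in features; the feature dimension, class count, and split are
+placeholders rather than anything from the QUILT-1M pipeline.
+<pre><code class="language-python">
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+# Stand-ins for frozen image embeddings and patch-level labels.
+rng = np.random.default_rng(0)
+features = rng.normal(size=(1000, 512))   # embeddings from a frozen encoder
+labels = rng.integers(0, 8, size=1000)    # e.g. 8 hypothetical sub-pathology classes
+
+x_train, x_test, y_train, y_test = train_test_split(
+    features, labels, test_size=0.2, random_state=0)
+
+# A linear probe: only this classifier is trained; the encoder stays frozen.
+probe = LogisticRegression(max_iter=1000)
+probe.fit(x_train, y_train)
+print("linear-probe accuracy:", probe.score(x_test, y_test))
+</code></pre>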
+
+
+
+
+ + ♻ ☆ Enhance Eye Disease Detection using Learnable Probabilistic Discrete + Latents in Machine Learning Architectures + + +
+ Ocular diseases, including diabetic retinopathy and glaucoma, present a
+significant public health challenge due to their high prevalence and potential
+for causing vision impairment. Early and accurate diagnosis is crucial for
+effective treatment and management. In recent years, deep learning models have
+emerged as powerful tools for analyzing medical images, such as retina imaging.
+However, challenges persist in model reliability and uncertainty estimation,
+which are critical for clinical decision-making. This study leverages the
+probabilistic framework of Generative Flow Networks (GFlowNets) to learn the
+posterior distribution over latent discrete dropout masks for the
+classification and analysis of ocular diseases using fundus images. We develop
+a robust and generalizable method that utilizes GFlowOut integrated with
+ResNet18 and ViT models as the backbone in identifying various ocular
+conditions. This study employs a unique set of dropout masks (none, random,
+bottomup, and topdown) to enhance model performance in analyzing these fundus
+images. Our results demonstrate that our learnable probabilistic latents
+significantly improve accuracy, outperforming the traditional dropout approach.
+We utilize a gradient map calculation method, Grad-CAM, to assess model
+explainability, observing that the model accurately focuses on critical image
+regions for predictions. The integration of GFlowOut in neural networks
+presents a promising advancement in the automated diagnosis of ocular diseases,
+with implications for improving clinical workflows and patient outcomes.
+
+
+
+
+
+
+ + ♻ ☆ RGB-D Indiscernible Object Counting in Underwater Scenes + + +
+ Recently, indiscernible/camouflaged scene understanding has attracted lots of
+research attention in the vision community. We further advance the frontier of
+this field by systematically studying a new challenge named indiscernible
+object counting (IOC), the goal of which is to count objects that are blended
+with respect to their surroundings. Due to a lack of appropriate IOC datasets,
+we present a large-scale dataset IOCfish5K which contains a total of 5,637
+high-resolution images and 659,024 annotated center points. Our dataset
+consists of a large number of indiscernible objects (mainly fish) in underwater
+scenes, making the annotation process all the more challenging. IOCfish5K is
+superior to existing datasets with indiscernible scenes because of its larger
+scale, higher image resolutions, more annotations, and denser scenes. All these
+aspects make it the most challenging dataset for IOC so far, supporting
+progress in this area. Benefiting from the recent advancements of depth
+estimation foundation models, we construct high-quality depth maps for
+IOCfish5K by generating pseudo labels using the Depth Anything V2 model. The
+RGB-D version of IOCfish5K is named IOCfish5K-D. For benchmarking purposes on
+IOCfish5K, we select 14 mainstream methods for object counting and carefully
+evaluate them. For the multimodal IOCfish5K-D, we evaluate 4 other popular
+multimodal counting methods. Furthermore, we propose IOCFormer, a new strong
+baseline that combines density and regression branches in a unified framework
+and can effectively tackle object counting under concealed scenes. We also
+propose IOCFormer-D to enable the effective usage of depth modality in helping
+detect and count objects hidden in their environments. Experiments show that
+IOCFormer and IOCFormer-D achieve state-of-the-art scores on IOCfish5K and
+IOCfish5K-D, respectively.
+
+
+
+ comment: Journal version. The resources are available at + https://github.com/GuoleiSun/Indiscernible-Object-Counting +
+
+
+
+
+ + ♻ ☆ CMAR-Net: Accurate Cross-Modal 3D SAR Reconstruction of Vehicle Targets + with Sparse Multi-Baseline Data + + +
+ Multi-baseline Synthetic Aperture Radar (SAR) three-dimensional (3D) +tomography is a crucial remote sensing technique that provides 3D resolution +unavailable in conventional SAR imaging. However, achieving high-quality +imaging typically requires multi-angle or full-aperture data, resulting in +significant imaging costs. Recent advancements in sparse 3D SAR, which rely on +data from limited apertures, have gained attention as a cost-effective +alternative. Notably, deep learning techniques have markedly enhanced the +imaging quality of sparse 3D SAR. Despite these advancements, existing methods +primarily depend on high-resolution radar images for supervising the training +of deep neural networks (DNNs). This exclusive dependence on single-modal data +prevents the introduction of complementary information from other data sources, +limiting further improvements in imaging performance. In this paper, we +introduce a Cross-Modal 3D-SAR Reconstruction Network (CMAR-Net) to enhance 3D +SAR imaging by integrating heterogeneous information. Leveraging cross-modal +supervision from 2D optical images and error transfer guaranteed by +differentiable rendering, CMAR-Net achieves efficient training and reconstructs +highly sparse multi-baseline SAR data into visually structured and accurate 3D +images, particularly for vehicle targets. Extensive experiments on simulated +and real-world datasets demonstrate that CMAR-Net significantly outperforms +SOTA sparse reconstruction algorithms based on compressed sensing (CS) and deep +learning (DL). Furthermore, our method eliminates the need for time-consuming +full-aperture data preprocessing and relies solely on computer-rendered optical +images, significantly reducing dataset construction costs. This work highlights +the potential of deep learning for multi-baseline SAR 3D imaging and introduces +a novel framework for radar imaging research through cross-modal learning. + +
+
+
+
+
+ + ♻ ☆ Arc2Avatar: Generating Expressive 3D Avatars from a Single Image via ID + Guidance + + +
+ Inspired by the effectiveness of 3D Gaussian Splatting (3DGS) in +reconstructing detailed 3D scenes within multi-view setups and the emergence of +large 2D human foundation models, we introduce Arc2Avatar, the first SDS-based +method utilizing a human face foundation model as guidance with just a single +image as input. To achieve that, we extend such a model for diverse-view human +head generation by fine-tuning on synthetic data and modifying its +conditioning. Our avatars maintain a dense correspondence with a human face +mesh template, allowing blendshape-based expression generation. This is +achieved through a modified 3DGS approach, connectivity regularizers, and a +strategic initialization tailored for our task. Additionally, we propose an +optional efficient SDS-based correction step to refine the blendshape +expressions, enhancing realism and diversity. Experiments demonstrate that +Arc2Avatar achieves state-of-the-art realism and identity preservation, +effectively addressing color issues by allowing the use of very low guidance, +enabled by our strong identity prior and initialization strategy, without +compromising detail. Please visit https://arc2avatar.github.io for more +resources. + +
+
+ comment: Project Page https://arc2avatar.github.io +
+
+
+
+
+ + ♻ ☆ RAD-DINO: Exploring Scalable Medical Image Encoders Beyond Text + Supervision + + +
+ Language-supervised pre-training has proven to be a valuable method for +extracting semantically meaningful features from images, serving as a +foundational element in multimodal systems within the computer vision and +medical imaging domains. However, the computed features are limited by the +information contained in the text, which is particularly problematic in medical +imaging, where the findings described by radiologists focus on specific +observations. This challenge is compounded by the scarcity of paired +imaging-text data due to concerns over leakage of personal health information. +In this work, we fundamentally challenge the prevailing reliance on language +supervision for learning general-purpose biomedical imaging encoders. We +introduce RAD-DINO, a biomedical image encoder pre-trained solely on unimodal +biomedical imaging data that obtains similar or greater performance than +state-of-the-art biomedical language-supervised models on a diverse range of +benchmarks. Specifically, the quality of learned representations is evaluated +on standard imaging tasks (classification and semantic segmentation), and a +vision-language alignment task (text report generation from images). To further +demonstrate the drawback of language supervision, we show that features from +RAD-DINO correlate with other medical records (e.g., sex or age) better than +language-supervised models, which are generally not mentioned in radiology +reports. Finally, we conduct a series of ablations determining the factors in +RAD-DINO's performance; notably, we observe that RAD-DINO's downstream +performance scales well with the quantity and diversity of training data, +demonstrating that image-only supervision is a scalable approach for training a +foundational biomedical image encoder. Model weights of RAD-DINO trained on +publicly available datasets are available at +https://huggingface.co/microsoft/rad-dino. + +
+
+
+
+
+ + ♻ ☆ Agentic Copyright Watermarking against Adversarial Evidence Forgery with + Purification-Agnostic Curriculum Proxy Learning + + +
+ With the proliferation of AI agents in various domains, protecting the +ownership of AI models has become crucial due to the significant investment in +their development. Unauthorized use and illegal distribution of these models +pose serious threats to intellectual property, necessitating effective +copyright protection measures. Model watermarking has emerged as a key +technique to address this issue, embedding ownership information within models +to assert rightful ownership during copyright disputes. This paper presents +several contributions to model watermarking: a self-authenticating black-box +watermarking protocol using hash techniques, a study on evidence forgery +attacks using adversarial perturbations, a proposed defense involving a +purification step to counter adversarial attacks, and a purification-agnostic +curriculum proxy learning method to enhance watermark robustness and model +performance. Experimental results demonstrate the effectiveness of these +approaches in improving the security, reliability, and performance of +watermarked models. + +
+
+
+
+
+ + ♻ ☆ Rethinking Decoders for Transformer-based Semantic Segmentation: A + Compression Perspective NeurIPS2024 + + +
+ State-of-the-art methods for Transformer-based semantic segmentation
+typically adopt Transformer decoders that are used to extract additional
+embeddings from image embeddings via cross-attention, refine either or both
+types of embeddings via self-attention, and project image embeddings onto the
+additional embeddings via dot-product. Despite their remarkable success, these
+empirical designs still lack theoretical justifications or interpretations,
+thus hindering potentially principled improvements. In this paper, we argue
+that there are fundamental connections between semantic segmentation and
+compression, especially between the Transformer decoders and Principal
+Component Analysis (PCA). From such a perspective, we derive a white-box, fully
+attentional DEcoder for PrIncipled semantiC segmenTation (DEPICT), with the
+interpretations as follows: 1) the self-attention operator refines image
+embeddings to construct an ideal principal subspace that aligns with the
+supervision and retains most information; 2) the cross-attention operator seeks
+to find a low-rank approximation of the refined image embeddings, which is
+expected to be a set of orthonormal bases of the principal subspace and
+corresponds to the predefined classes; 3) the dot-product operation yields a
+compact representation for image embeddings as segmentation masks. Experiments
+conducted on the ADE20K dataset find that DEPICT consistently outperforms its
+black-box counterpart, Segmenter, and that it is lightweight and more robust.
+
+
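+ To make the compression analogy concrete, the toy NumPy sketch below treats
+per-pixel image embeddings as data points, builds a rank-k principal subspace
+via SVD, and uses dot products with the resulting bases as per-pixel class
+scores, loosely mirroring the three interpretations listed above. It only
+illustrates the PCA connection, not the DEPICT decoder; the embedding sizes and
+the hard argmax are arbitrary choices.
+<pre><code class="language-python">
+import numpy as np
+
+rng = np.random.default_rng(0)
+Z = rng.normal(size=(4096, 64))   # 4096 "pixel" embeddings of dimension 64
+k = 8                             # number of classes / principal directions
+
+# Principal subspace of the centered embeddings via SVD.
+Zc = Z - Z.mean(axis=0, keepdims=True)
+_, _, Vt = np.linalg.svd(Zc, full_matrices=False)
+bases = Vt[:k]                    # k orthonormal bases, shape (k, 64)
+
+# Low-rank approximation of the embeddings (the "compression" view).
+Z_lowrank = Zc @ bases.T @ bases
+
+# Dot products with the bases act as per-pixel class scores / mask logits.
+scores = Zc @ bases.T             # shape (4096, k)
+masks = scores.argmax(axis=1)     # hard per-pixel assignment
+print(Z_lowrank.shape, masks.shape)
+</code></pre>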
+
+ comment: NeurIPS2024. Code:https://github.com/QishuaiWen/DEPICT/ +
+
+
+
+
+ + ♻ ☆ ScVLM: Enhancing Vision-Language Model for Safety-Critical Event + Understanding + + +
+ Accurately identifying, understanding and describing traffic safety-critical +events (SCEs), including crashes, tire strikes, and near-crashes, is crucial +for advanced driver assistance systems, automated driving systems, and traffic +safety. As SCEs are rare events, most general vision-language models (VLMs) +have not been trained sufficiently to link SCE videos and narratives, which +could lead to hallucinations and missing key safety characteristics. Here, we +introduce ScVLM, a novel hybrid methodology that integrates supervised and +contrastive learning techniques to classify the severity and types of SCEs, as +well as to generate narrative descriptions of SCEs. This approach utilizes +classification to enhance VLMs' comprehension of driving videos and improve the +rationality of event descriptions. The proposed approach is trained on and +evaluated by more than 8,600 SCEs from the Second Strategic Highway Research +Program Naturalistic Driving Study dataset, the largest publicly accessible +driving dataset with videos and SCE annotations. The results demonstrate the +superiority of the proposed approach in generating contextually accurate event +descriptions and mitigating VLM hallucinations. The code will be available at +https://github.com/datadrivenwheels/ScVLM. + +
+
+ comment: To appear in Proceedings of the IEEE/CVF Winter Conference on + Applications of Computer Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ Automation of Quantum Dot Measurement Analysis via Explainable Machine + Learning + + +
+ The rapid development of quantum dot (QD) devices for quantum computing has +necessitated more efficient and automated methods for device characterization +and tuning. This work demonstrates the feasibility and advantages of applying +explainable machine learning techniques to the analysis of quantum dot +measurements, paving the way for further advances in automated and transparent +QD device tuning. Many of the measurements acquired during the tuning process +come in the form of images that need to be properly analyzed to guide the +subsequent tuning steps. By design, features present in such images capture +certain behaviors or states of the measured QD devices. When considered +carefully, such features can aid the control and calibration of QD devices. An +important example of such images are so-called $\textit{triangle plots}$, which +visually represent current flow and reveal characteristics important for QD +device calibration. While image-based classification tools, such as +convolutional neural networks (CNNs), can be used to verify whether a given +measurement is $\textit{good}$ and thus warrants the initiation of the next +phase of tuning, they do not provide any insights into how the device should be +adjusted in the case of $\textit{bad}$ images. This is because CNNs sacrifice +prediction and model intelligibility for high accuracy. To ameliorate this +trade-off, a recent study introduced an image vectorization approach that +relies on the Gabor wavelet transform (Schug $\textit{et al.}$ 2024 +$\textit{Proc. XAI4Sci: Explainable Machine Learning for Sciences Workshop +(AAAI 2024) (Vancouver, Canada)}$ pp 1-6). Here we propose an alternative +vectorization method that involves mathematical modeling of synthetic triangles +to mimic the experimental data. Using explainable boosting machines, we show +that this new method offers superior explainability of model prediction without +sacrificing accuracy. + +
+
+ comment: 20 pages, 5 figures, abbreviated version published in Proceedings of + the XAI4Sci: Explainable machine learning for sciences workshop at AAAI 2024, + (Vancouver, Canada) +
+
+
+
+
+ + ♻ ☆ Class Distance Weighted Cross Entropy Loss for Classification of Disease + Severity + + +
+ Assessing disease severity with ordinal classes, where each class reflects
+increasing severity levels, benefits from loss functions designed for this
+ordinal structure. Traditional categorical loss functions, like Cross-Entropy
+(CE), often perform suboptimally in these scenarios. To address this, we
+propose a novel loss function, Class Distance Weighted Cross-Entropy (CDW-CE),
+which penalizes misclassifications more severely when the predicted and actual
+classes are farther apart. We evaluated CDW-CE using various deep
+architectures, comparing its performance against several categorical and
+ordinal loss functions. To assess the quality of latent representations, we
+used t-distributed stochastic neighbor embedding (t-SNE) and uniform manifold
+approximation and projection (UMAP) visualizations, quantified the clustering
+quality using the Silhouette Score, and compared Class Activation Maps (CAM)
+generated by models trained with CDW-CE and CE loss. Feedback from domain
+experts was incorporated to evaluate how well model attention aligns with
+expert opinion. Our results show that CDW-CE consistently improves performance
+in ordinal image classification tasks. It achieves higher Silhouette Scores,
+indicating better class discrimination capability, and its CAM visualizations
+show a stronger focus on clinically significant regions, as validated by domain
+experts. Receiver operating characteristic (ROC) curves and the area under the
+curve (AUC) scores highlight that CDW-CE outperforms other loss functions,
+including prominent ordinal loss functions from the literature.
+
+
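+ The abstract does not spell out the exact form of CDW-CE, but one natural
+reading is a cross-entropy-style loss in which each class term is weighted by
+its distance from the true class, so that far-off mistakes cost more. The
+PyTorch sketch below implements that reading; the weighting scheme and the
+power alpha are assumptions, not the paper's definition.
+<pre><code class="language-python">
+import torch
+import torch.nn.functional as F
+
+def cdw_ce(logits, target, alpha=2.0):
+    """Class-distance-weighted cross-entropy (one plausible formulation).
+
+    Penalizes probability mass placed on classes far from the true class via
+    terms of the form -|i - c|^alpha * log(1 - p_i), summed over wrong classes.
+    """
+    probs = F.softmax(logits, dim=1)                   # (batch, num_classes)
+    classes = torch.arange(logits.shape[1], device=logits.device)
+    dist = (classes.unsqueeze(0) - target.unsqueeze(1)).abs().float()
+    weights = dist.pow(alpha)                          # zero weight on the true class
+    loss = -(weights * torch.log(1.0 - probs + 1e-8)).sum(dim=1)
+    return loss.mean()
+
+logits = torch.randn(4, 5)             # 4 samples, 5 ordinal severity classes
+target = torch.tensor([0, 2, 3, 4])
+print(cdw_ce(logits, target))
+</code></pre>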
+
+
+
+
+ + ♻ ☆ FusionSORT: Fusion Methods for Online Multi-object Visual Tracking + + +
+ In this work, we investigate four different fusion methods for associating
+detections to tracklets in multi-object visual tracking. In addition to
+considering strong cues such as motion and appearance information, we also
+consider weak cues such as height intersection-over-union (height-IoU) and
+tracklet confidence information in the data association using different fusion
+methods. These fusion methods include minimum, weighted sum based on IoU,
+Kalman filter (KF) gating, and the Hadamard product of costs due to the
+different cues. We conduct extensive evaluations on validation sets of the
+MOT17, MOT20 and DanceTrack datasets, and find that the choice of fusion method
+is key for data association in multi-object visual tracking. We hope that this
+investigative work helps the computer vision research community to use the
+right fusion method for data association in multi-object visual tracking.
+
+
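+ To make the four fusion options concrete, the sketch below combines a motion
+cost matrix and an appearance cost matrix for detection-tracklet association
+under each rule; the cue definitions, the IoU-based weighting, and the gating
+threshold are placeholders rather than the paper's settings.
+<pre><code class="language-python">
+import numpy as np
+
+def fuse_costs(motion_cost, appearance_cost, method="weighted", iou=None, gate=1.0):
+    """Fuse per-pair association costs from different cues (toy sketch)."""
+    if method == "minimum":
+        return np.minimum(motion_cost, appearance_cost)
+    if method == "weighted":
+        # Weight by an IoU-derived confidence when available, else equally.
+        w = iou if iou is not None else 0.5
+        return w * motion_cost + (1.0 - w) * appearance_cost
+    if method == "gating":
+        # Kalman-filter-style gating: forbid pairs whose motion cost exceeds a gate.
+        fused = appearance_cost.copy()
+        fused[motion_cost > gate] = 1e6
+        return fused
+    if method == "hadamard":
+        return motion_cost * appearance_cost
+    raise ValueError(method)
+
+rng = np.random.default_rng(0)
+motion, appearance = rng.random((3, 4)), rng.random((3, 4))
+print(fuse_costs(motion, appearance, method="hadamard"))
+</code></pre>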
+
+
+
+
+ + ♻ ☆ Light Transport-aware Diffusion Posterior Sampling for Single-View + Reconstruction of 3D Volumes + + +
+ We introduce a single-view reconstruction technique of volumetric fields in +which multiple light scattering effects are omnipresent, such as in clouds. We +model the unknown distribution of volumetric fields using an unconditional +diffusion model trained on a novel benchmark dataset comprising 1,000 +synthetically simulated volumetric density fields. The neural diffusion model +is trained on the latent codes of a novel, diffusion-friendly, monoplanar +representation. The generative model is used to incorporate a tailored +parametric diffusion posterior sampling technique into different reconstruction +tasks. A physically-based differentiable volume renderer is employed to provide +gradients with respect to light transport in the latent space. This stands in +contrast to classic NeRF approaches and makes the reconstructions better +aligned with observed data. Through various experiments, we demonstrate +single-view reconstruction of volumetric clouds at a previously unattainable +quality. + +
+
+
+
+
+ + ♻ ☆ Zero-Shot Pupil Segmentation with SAM 2: A Case Study of Over 14 Million + Images + + +
+ We explore the transformative potential of SAM 2, a vision foundation model,
+in advancing gaze estimation and eye tracking technologies. By significantly
+reducing annotation time, lowering technical barriers through its ease of
+deployment, and enhancing segmentation accuracy, SAM 2 addresses critical
+challenges faced by researchers and practitioners. Utilizing its zero-shot
+segmentation capabilities with minimal user input (a single click per video),
+we tested SAM 2 on over 14 million eye images from diverse datasets, including
+virtual reality setups and the world's largest unified dataset recorded using
+wearable eye trackers. Remarkably, in pupil segmentation tasks, SAM 2 matches
+the performance of domain-specific models trained solely on eye images,
+achieving competitive mean Intersection over Union (mIoU) scores of up to 93%
+without fine-tuning. Additionally, we provide our code and segmentation masks
+for these widely used datasets to promote further research.
+
+
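+ For reference, the mIoU figures quoted above come from the standard
+intersection-over-union, computed per image and averaged. The binary-mask
+sketch below shows that computation on random data and is not tied to the SAM 2
+pipeline.
+<pre><code class="language-python">
+import numpy as np
+
+def mean_iou(pred_masks, gt_masks, eps=1e-7):
+    """Mean IoU over a batch of binary segmentation masks (values 0/1)."""
+    ious = []
+    for pred, gt in zip(pred_masks, gt_masks):
+        inter = np.logical_and(pred, gt).sum()
+        union = np.logical_or(pred, gt).sum()
+        ious.append((inter + eps) / (union + eps))
+    return float(np.mean(ious))
+
+rng = np.random.default_rng(0)
+pred = rng.integers(0, 2, size=(4, 64, 64))
+gt = rng.integers(0, 2, size=(4, 64, 64))
+print(mean_iou(pred, gt))
+</code></pre>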
+
+ comment: Virmarie Maquiling and Sean Anthony Byrne contributed equally to this + paper, 8 pages, 3 figures, ETRA 2025, pre-print +
+
+
+
+
+ + ♻ ☆ Expanding Performance Boundaries of Open-Source Multimodal Models with + Model, Data, and Test-Time Scaling + + +
+ We introduce InternVL 2.5, an advanced multimodal large language model (MLLM)
+series that builds upon InternVL 2.0, maintaining its core model architecture
+while introducing significant enhancements in training and testing strategies
+as well as data quality. In this work, we delve into the relationship between
+model scaling and performance, systematically exploring the performance trends
+in vision encoders, language models, dataset sizes, and test-time
+configurations. Through extensive evaluations on a wide range of benchmarks,
+including multi-discipline reasoning, document understanding, multi-image /
+video understanding, real-world comprehension, multimodal hallucination
+detection, visual grounding, multilingual capabilities, and pure language
+processing, InternVL 2.5 exhibits competitive performance, rivaling leading
+commercial models such as GPT-4o and Claude-3.5-Sonnet. Notably, our model is
+the first open-source MLLM to surpass 70% on the MMMU benchmark, achieving a
+3.7-point improvement through Chain-of-Thought (CoT) reasoning and showcasing
+strong potential for test-time scaling. We hope this model contributes to the
+open-source community by setting new standards for developing and applying
+multimodal AI systems. A HuggingFace demo is available at
+https://huggingface.co/spaces/OpenGVLab/InternVL
+
+
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ BayesAdapter: enhanced uncertainty estimation in CLIP few-shot + adaptation + + +
+ The emergence of large pre-trained vision-language models (VLMs) represents a
+paradigm shift in machine learning, with unprecedented results in a broad span
+of visual recognition tasks. CLIP, one of the most popular VLMs, has exhibited
+remarkable zero-shot and transfer learning capabilities in classification. To
+transfer CLIP to downstream tasks, adapters constitute a parameter-efficient
+approach that avoids backpropagation through the large model (unlike related
+prompt learning methods). However, CLIP adapters have been developed to target
+discriminative performance, and the quality of their uncertainty estimates has
+been overlooked. In this work we show that the discriminative performance of
+state-of-the-art CLIP adapters does not always correlate with their uncertainty
+estimation capabilities, which are essential for a safe deployment in
+real-world scenarios. We also demonstrate that one such adapter is obtained
+through MAP inference from a more general probabilistic framework. Based on
+this observation we introduce BayesAdapter, which leverages Bayesian inference
+to estimate a full probability distribution instead of a single point, better
+capturing the variability inherent in the parameter space. In a comprehensive
+empirical evaluation we show that our approach obtains high quality uncertainty
+estimates in the predictions, standing out in calibration and selective
+classification. Our code will be publicly available upon acceptance of the
+paper.
+
+
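+ Calibration, one of the qualities highlighted above, is commonly summarized by
+the expected calibration error (ECE). The sketch below computes a basic binned
+ECE on toy predictions; the bin count and the synthetic data are arbitrary and
+independent of the BayesAdapter method itself.
+<pre><code class="language-python">
+import numpy as np
+
+def expected_calibration_error(confidences, correct, n_bins=15):
+    """Binned ECE: average |accuracy - confidence| weighted by bin size."""
+    bins = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(bins[:-1], bins[1:]):
+        in_bin = (confidences > lo) & (confidences <= hi)
+        if in_bin.any():
+            acc = correct[in_bin].mean()
+            conf = confidences[in_bin].mean()
+            ece += in_bin.mean() * abs(acc - conf)
+    return ece
+
+rng = np.random.default_rng(0)
+conf = rng.uniform(0.5, 1.0, size=1000)
+correct = (conf > rng.random(1000)).astype(float)  # toy: accuracy tracks confidence
+print(expected_calibration_error(conf, correct))
+</code></pre>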
+
+ comment: 30 pages, 5 figures, 23 tables +
+
+
+
+
+ + ♻ ☆ GIM: A Million-scale Benchmark for Generative Image Manipulation + Detection and Localization + + +
+ The extraordinary ability of generative models emerges as a new trend in
+image editing and generating realistic images, posing a serious threat to the
+trustworthiness of multimedia data and driving the research of image
+manipulation detection and localization (IMDL). However, the lack of a
+large-scale data foundation makes the IMDL task unattainable. In this paper, we
+build a local manipulation data generation pipeline that integrates the
+powerful capabilities of SAM, LLM, and generative models. Upon this basis, we
+propose the GIM dataset, which has the following advantages: 1) Large scale,
+GIM includes over one million pairs of AI-manipulated images and real images.
+2) Rich image content, GIM encompasses a broad range of image classes. 3)
+Diverse generative manipulation, the images are manipulated with
+state-of-the-art generators across various manipulation tasks. The
+aforementioned advantages allow for a more comprehensive evaluation of IMDL
+methods, extending their applicability to diverse images. We introduce the GIM
+benchmark with two settings to evaluate existing IMDL methods. In addition, we
+propose a novel IMDL framework, termed GIMFormer, which consists of a
+ShadowTracer, Frequency-Spatial block (FSB), and a Multi-Window Anomalous
+Modeling (MWAM) module. Extensive experiments on GIM demonstrate that GIMFormer
+surpasses the previous state-of-the-art approach on two different benchmarks.
+
+
+
+ comment: Code page: https://github.com/chenyirui/GIM +
+
+
+
+
+ + ♻ ☆ Point-JEPA: A Joint Embedding Predictive Architecture for + Self-Supervised Learning on Point Cloud + + +
+ Recent advancements in self-supervised learning in the point cloud domain +have demonstrated significant potential. However, these methods often suffer +from drawbacks, including lengthy pre-training time, the necessity of +reconstruction in the input space, or the necessity of additional modalities. +In order to address these issues, we introduce Point-JEPA, a joint embedding +predictive architecture designed specifically for point cloud data. To this +end, we introduce a sequencer that orders point cloud patch embeddings to +efficiently compute and utilize their proximity based on the indices during +target and context selection. The sequencer also allows shared computations of +the patch embeddings' proximity between context and target selection, further +improving the efficiency. Experimentally, our method achieves competitive +results with state-of-the-art methods while avoiding the reconstruction in the +input space or additional modality. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ SCC-YOLO: An Improved Object Detector for Assisting in Brain Tumor + Diagnosis + + +
+ Brain tumors can result in neurological dysfunction, alterations in cognitive
+and psychological states, increased intracranial pressure, and the occurrence
+of seizures, thereby presenting a substantial risk to human life and health.
+The You Only Look Once (YOLO) series of models has demonstrated superior
+accuracy in object detection for medical imaging. In this paper, we develop a
+novel SCC-YOLO architecture by integrating the SCConv attention mechanism into
+YOLOv9. The SCConv module reconstructs an efficient convolutional module by
+reducing spatial and channel redundancy among features, thereby enhancing the
+learning of image features. We investigate the impact of integrating different
+attention mechanisms with the YOLOv9 model on brain tumor image detection using
+both the Br35H dataset and our self-made dataset (Brain_Tumor_Dataset).
+Experimental results show that on the Br35H dataset, SCC-YOLO achieved a 0.3%
+improvement in mAP50 compared to YOLOv9, while on our self-made dataset,
+SCC-YOLO exhibited a 0.5% improvement over YOLOv9. SCC-YOLO has reached
+state-of-the-art performance in brain tumor detection. Source code is available
+at: https://jihulab.com/healthcare-information-studio/SCC-YOLO/-/tree/master
+
+
+
+
+
+
+ + ♻ ☆ Text-Guided Coarse-to-Fine Fusion Network for Robust Remote Sensing + Visual Question Answering + + +
+ Remote Sensing Visual Question Answering (RSVQA) has gained significant +research interest. However, current RSVQA methods are limited by the imaging +mechanisms of optical sensors, particularly under challenging conditions such +as cloud-covered and low-light scenarios. Given the all-time and all-weather +imaging capabilities of Synthetic Aperture Radar (SAR), it is crucial to +investigate the integration of optical-SAR images to improve RSVQA performance. +In this work, we propose a Text-guided Coarse-to-Fine Fusion Network (TGFNet), +which leverages the semantic relationships between question text and +multi-source images to guide the network toward complementary fusion at the +feature level. Specifically, we develop a Text-guided Coarse-to-Fine Attention +Refinement (CFAR) module to focus on key areas related to the question in +complex remote sensing images. This module progressively directs attention from +broad areas to finer details through key region routing, enhancing the model's +ability to focus on relevant regions. Furthermore, we propose an Adaptive +Multi-Expert Fusion (AMEF) module that dynamically integrates different +experts, enabling the adaptive fusion of optical and SAR features. In addition, +we create the first large-scale benchmark dataset for evaluating optical-SAR +RSVQA methods, comprising 6,008 well-aligned optical-SAR image pairs and +1,036,694 well-labeled question-answer pairs across 16 diverse question types, +including complex relational reasoning questions. Extensive experiments on the +proposed dataset demonstrate that our TGFNet effectively integrates +complementary information between optical and SAR images, significantly +improving the model's performance in challenging scenarios. The dataset is +available at: https://github.com/mmic-lcl/. + Index Terms: Remote Sensing Visual Question Answering, Multi-source Data +Fusion, Multimodal, Remote Sensing, OPT-SAR. + +
+
+
+
+
+ + ♻ ☆ AI-Driven Early Mental Health Screening: Analyzing Selfies of Pregnant + Women + + +
+ Major Depressive Disorder and anxiety disorders affect millions globally, +contributing significantly to the burden of mental health issues. Early +screening is crucial for effective intervention, as timely identification of +mental health issues can significantly improve treatment outcomes. Artificial +intelligence (AI) can be valuable for improving the screening of mental +disorders, enabling early intervention and better treatment outcomes. AI-driven +screening can leverage the analysis of multiple data sources, including facial +features in digital images. However, existing methods often rely on controlled +environments or specialized equipment, limiting their broad applicability. This +study explores the potential of AI models for ubiquitous depression-anxiety +screening given face-centric selfies. The investigation focuses on high-risk +pregnant patients, a population that is particularly vulnerable to mental +health issues. To cope with limited training data resulting from our clinical +setup, pre-trained models were utilized in two different approaches: +fine-tuning convolutional neural networks (CNNs) originally designed for facial +expression recognition and employing vision-language models (VLMs) for +zero-shot analysis of facial expressions. Experimental results indicate that +the proposed VLM-based method significantly outperforms CNNs, achieving an +accuracy of 77.6%. Although there is significant room for improvement, the +results suggest that VLMs can be a promising approach for mental health +screening. + +
+
+ comment: This article has been accepted for publication in HEALTHINF25 at the + 18th International Joint Conference on Biomedical Engineering Systems and + Technologies (BIOSTEC 2025) +
+
+
+
+
+ + ♻ ☆ Improving Forward Compatibility in Class Incremental Learning by + Increasing Representation Rank and Feature Richness + + +
+ Class Incremental Learning (CIL) constitutes a pivotal subfield within +continual learning, aimed at enabling models to progressively learn new +classification tasks while retaining knowledge obtained from prior tasks. +Although previous studies have predominantly focused on backward compatible +approaches to mitigate catastrophic forgetting, recent investigations have +introduced forward compatible methods to enhance performance on novel tasks and +complement existing backward compatible methods. In this study, we introduce an +effective-Rank based Feature Richness enhancement (RFR) method, designed for +improving forward compatibility. Specifically, this method increases the +effective rank of representations during the base session, thereby facilitating +the incorporation of more informative features pertinent to unseen novel tasks. +Consequently, RFR achieves dual objectives in backward and forward +compatibility: minimizing feature extractor modifications and enhancing novel +task performance, respectively. To validate the efficacy of our approach, we +establish a theoretical connection between effective rank and the Shannon +entropy of representations. Subsequently, we conduct comprehensive experiments +by integrating RFR into eleven well-known CIL methods. Our results demonstrate +the effectiveness of our approach in enhancing novel-task performance while +mitigating catastrophic forgetting. Furthermore, our method notably improves +the average incremental accuracy across all eleven cases examined. + +
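+ The effective rank mentioned above is commonly defined as the exponential of
+the Shannon entropy of the normalized singular values of a representation
+matrix. The NumPy sketch below computes that quantity on synthetic features; it
+involves none of the RFR training machinery.
+<pre><code class="language-python">
+import numpy as np
+
+def effective_rank(features, eps=1e-12):
+    """Effective rank = exp(Shannon entropy of normalized singular values)."""
+    s = np.linalg.svd(features, compute_uv=False)
+    p = s / (s.sum() + eps)                  # singular values as a distribution
+    entropy = -(p * np.log(p + eps)).sum()   # Shannon entropy
+    return float(np.exp(entropy))
+
+rng = np.random.default_rng(0)
+low_rank = rng.normal(size=(512, 4)) @ rng.normal(size=(4, 128))
+full_rank = rng.normal(size=(512, 128))
+print(effective_rank(low_rank), effective_rank(full_rank))  # small vs. large
+</code></pre>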
+
+
+
+
+ + ♻ ☆ Benchmarking Counterfactual Image Generation NeurIPS 2024 + + +
+ Generative AI has revolutionised visual content editing, empowering users to
+effortlessly modify images and videos. However, not all edits are equal. To
+perform realistic edits in domains such as natural images or medical imaging,
+modifications must respect causal relationships inherent to the data generation
+process. Such image editing falls into the counterfactual image generation
+regime. Evaluating counterfactual image generation is substantially complex:
+not only does it lack observable ground truths, but it also requires adherence
+to causal constraints. Although several counterfactual image generation methods
+and evaluation metrics exist, a comprehensive comparison within a unified
+setting is lacking. We present a comparison framework to thoroughly benchmark
+counterfactual image generation methods. We integrate all models that have been
+used for the task at hand and expand them to novel datasets and causal graphs,
+demonstrating the superiority of Hierarchical VAEs across most datasets and
+metrics. Our framework is implemented in a user-friendly Python package that
+can be extended to incorporate additional SCMs, causal methods, generative
+models, and datasets for the community to build on. Code:
+https://github.com/gulnazaki/counterfactual-benchmark.
+
+
+
+ comment: Published as a conference paper at NeurIPS 2024 Datasets and + Benchmarks Track https://openreview.net/forum?id=0T8xRFrScB Project page: + https://gulnazaki.github.io/counterfactual-benchmark +
+
+
+
+
+ + ♻ ☆ Situational Scene Graph for Structured Human-centric Situation + Understanding + + +
+ Graph-based representation has been widely used in modelling spatio-temporal
+relationships in video understanding. Although effective, existing graph-based
+approaches focus on capturing the human-object relationships while ignoring
+fine-grained semantic properties of the action components. These semantic
+properties are crucial for understanding the current situation, such as where
+the action takes place, what tools are used, and the functional properties of
+the objects. In this work, we propose a graph-based representation called
+Situational Scene Graph (SSG) to encode both human-object relationships and the
+corresponding semantic properties. The semantic details are represented as
+predefined roles and values inspired by the situation frame, which was
+originally designed to represent a single action. Based on our proposed
+representation, we introduce the task of situational scene graph generation and
+propose a multi-stage pipeline Interactive and Complementary Network (InComNet)
+to address the task. Given that the existing datasets are not applicable to the
+task, we further introduce a SSG dataset whose annotations consist of semantic
+role-value frames for human, objects and verb predicates of human-object
+relations. Finally, we demonstrate the effectiveness of our proposed SSG
+representation by testing on different downstream tasks. Experimental results
+show that the unified representation can not only benefit predicate
+classification and semantic role-value classification, but also benefit
+reasoning tasks on human-centric situation understanding. We will release the
+code and the dataset soon.
+
+
+
+ comment: Accepted for WACV 2025 +
+
+
+
+
+ + ♻ ☆ Multi-Head Explainer: A General Framework to Improve Explainability in + CNNs and Transformers + + +
+ In this study, we introduce the Multi-Head Explainer (MHEX), a versatile and +modular framework that enhances both the explainability and accuracy of +Convolutional Neural Networks (CNNs) and Transformer-based models. MHEX +consists of three core components: an Attention Gate that dynamically +highlights task-relevant features, Deep Supervision that guides early layers to +capture fine-grained details pertinent to the target class, and an Equivalent +Matrix that unifies refined local and global representations to generate +comprehensive saliency maps. Our approach demonstrates superior compatibility, +enabling effortless integration into existing residual networks like ResNet and +Transformer architectures such as BERT with minimal modifications. Extensive +experiments on benchmark datasets in medical imaging and text classification +show that MHEX not only improves classification accuracy but also produces +highly interpretable and detailed saliency scores. + +
+
+
+
+
+ + ♻ ☆ OCTolyzer: Fully automatic toolkit for segmentation and feature + extracting in optical coherence tomography and scanning laser ophthalmoscopy + data + + +
+ Optical coherence tomography (OCT) and scanning laser ophthalmoscopy (SLO) of
+the eye have become essential to ophthalmology and the emerging field of
+oculomics, creating a need for transparent, reproducible, and rapid analysis of
+this data for clinical research and the wider research community. Here, we
+introduce OCTolyzer, the first open-source toolkit for retinochoroidal analysis
+in OCT/SLO data. It features two analysis suites for OCT and SLO data,
+facilitating deep learning-based anatomical segmentation and feature extraction
+of the cross-sectional retinal and choroidal layers and en face retinal
+vessels. We describe OCTolyzer and evaluate the reproducibility of its OCT
+choroid analysis. At the population level, metrics for choroid region thickness
+were highly reproducible, with a mean absolute error (MAE)/Pearson correlation
+for macular volume choroid thickness (CT) of 6.7$\mu$m/0.99, macular B-scan CT
+of 11.6$\mu$m/0.99, and peripapillary CT of 5.0$\mu$m/0.99. Macular choroid
+vascular index (CVI) also showed strong reproducibility, with MAE/Pearson for
+volume CVI yielding 0.0271/0.97 and B-scan CVI 0.0130/0.91. At the eye level,
+measurement noise for regional and vessel metrics was below 5% and 20% of the
+population's variability, respectively. Outliers were caused by poor-quality
+B-scans with thick choroids and an invisible choroid-sclera boundary.
+Processing times on a laptop CPU were under three seconds for
+macular/peripapillary B-scans and 85 seconds for volume scans. OCTolyzer can
+convert OCT/SLO data into reproducible and clinically meaningful
+retinochoroidal features and will improve the standardisation of ocular
+measurements in OCT/SLO image analysis, requiring no specialised training or
+proprietary software to be used. OCTolyzer is freely available here:
+https://github.com/jaburke166/OCTolyzer.
+
+
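+ The reproducibility figures above pair a mean absolute error with a Pearson
+correlation between repeated measurements. The short sketch below computes both
+for two synthetic measurement runs; the numbers stand in for no particular
+OCTolyzer output.
+<pre><code class="language-python">
+import numpy as np
+
+def reproducibility(run_a, run_b):
+    """MAE and Pearson correlation between two repeated measurement sets."""
+    mae = np.mean(np.abs(run_a - run_b))
+    r = np.corrcoef(run_a, run_b)[0, 1]
+    return mae, r
+
+rng = np.random.default_rng(0)
+ct_run1 = rng.normal(250.0, 40.0, size=100)          # e.g. choroid thickness in microns
+ct_run2 = ct_run1 + rng.normal(0.0, 7.0, size=100)   # repeat with measurement noise
+print(reproducibility(ct_run1, ct_run2))
+</code></pre>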
+
+ comment: Main paper: 15 pages, 9 figures, 3 tables. Supplementary material: 9 + pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ VibrantVS: A high-resolution multi-task transformer for forest canopy + height estimation + + +
+ This paper explores the application of a novel multi-task vision transformer +(ViT) model for the estimation of canopy height models (CHMs) using 4-band +National Agriculture Imagery Program (NAIP) imagery across the western United +States. We compare the effectiveness of this model in terms of accuracy and +precision aggregated across ecoregions and class heights versus three other +benchmark peer-reviewed models. Key findings suggest that, while other +benchmark models can provide high precision in localized areas, the VibrantVS +model has substantial advantages across a broad reach of ecoregions in the +western United States with higher accuracy, higher precision, the ability to +generate updated inference at a cadence of three years or less, and high +spatial resolution. The VibrantVS model provides significant value for +ecological monitoring and land management decisions for wildfire mitigation. + +
+
+ comment: 15 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ SyncDiff: Synchronized Motion Diffusion for Multi-Body Human-Object + Interaction Synthesis + + +
+ Synthesizing realistic human-object interaction motions is a critical problem +in VR/AR and human animation. Unlike the commonly studied scenarios involving a +single human or hand interacting with one object, we address a more generic +multi-body setting with arbitrary numbers of humans, hands, and objects. This +complexity introduces significant challenges in synchronizing motions due to +the high correlations and mutual influences among bodies. To address these +challenges, we introduce SyncDiff, a novel method for multi-body interaction +synthesis using a synchronized motion diffusion strategy. SyncDiff employs a +single diffusion model to capture the joint distribution of multi-body motions. +To enhance motion fidelity, we propose a frequency-domain motion decomposition +scheme. Additionally, we introduce a new set of alignment scores to emphasize +the synchronization of different body motions. SyncDiff jointly optimizes both +data sample likelihood and alignment likelihood through an explicit +synchronization strategy. Extensive experiments across four datasets with +various multi-body configurations demonstrate the superiority of SyncDiff over +existing state-of-the-art motion synthesis methods. + +
+
+
+
+
+ + ♻ ☆ PSA-VLM: Enhancing Vision-Language Model Safety through Progressive + Concept-Bottleneck-Driven Alignment + + +
+ Benefiting from the powerful capabilities of Large Language Models (LLMs),
+pre-trained visual encoder models connected to LLMs form Vision Language Models
+(VLMs). However, recent research shows that the visual modality in VLMs is
+highly vulnerable, allowing attackers to bypass safety alignment in LLMs
+through visually transmitted content, launching harmful attacks. To address
+this challenge, we propose a progressive concept-based alignment strategy,
+PSA-VLM, which incorporates safety modules as concept bottlenecks to enhance
+visual modality safety alignment. By aligning model predictions with specific
+safety concepts, we improve defenses against risky images, enhancing
+explainability and controllability while minimally impacting general
+performance. Our method is trained in two stages: the first stage has low
+computational cost and brings a very effective performance improvement, while
+fine-tuning the language model in the second stage further improves safety
+performance. Our method achieves state-of-the-art results on popular VLM safety
+benchmarks.
+
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.13581 +
+
+
+
+
+ + ♻ ☆ Migician: Revealing the Magic of Free-Form Multi-Image Grounding in + Multimodal Large Language Models + + +
+ The recent advancement of Multimodal Large Language Models (MLLMs) has +significantly improved their fine-grained perception of single images and +general comprehension across multiple images. However, existing MLLMs still +face challenges in achieving precise grounding in complex multi-image +scenarios. To address this, we first explore a Chain-of-Thought (CoT) framework +that integrates single-image grounding with multi-image comprehension. While +partially effective, it remains unstable and struggles to capture abstract +visual information due to its non-end-to-end nature. Therefore, we introduce +Migician, the first multi-image grounding model capable of performing free-form +and accurate grounding across multiple images. To support this, we present the +MGrounding-630k dataset, which comprises data for several multi-image grounding +tasks derived from existing datasets, along with newly generated free-form +grounding instruction-following data. Furthermore, we propose MIG-Bench, a +comprehensive benchmark specifically designed for evaluating multi-image +grounding capabilities. Experimental results demonstrate that our model +achieves significantly superior multi-image grounding capabilities, +outperforming the best existing MLLMs by 21.61% and even surpassing much larger +70B models. Our code, model, dataset, and benchmark are fully open-sourced at +https://migician-vg.github.io/. + +
+
+ comment: 20 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Amortizing intractable inference in diffusion models for vision, + language, and control NeurIPS 2024 + + +
+ Diffusion models have emerged as effective distribution estimators in vision, +language, and reinforcement learning, but their use as priors in downstream +tasks poses an intractable posterior inference problem. This paper studies +amortized sampling of the posterior over data, $\mathbf{x}\sim p^{\rm +post}(\mathbf{x})\propto p(\mathbf{x})r(\mathbf{x})$, in a model that consists +of a diffusion generative model prior $p(\mathbf{x})$ and a black-box +constraint or likelihood function $r(\mathbf{x})$. We state and prove the +asymptotic correctness of a data-free learning objective, relative trajectory +balance, for training a diffusion model that samples from this posterior, a +problem that existing methods solve only approximately or in restricted cases. +Relative trajectory balance arises from the generative flow network perspective +on diffusion models, which allows the use of deep reinforcement learning +techniques to improve mode coverage. Experiments illustrate the broad potential +of unbiased inference of arbitrary posteriors under diffusion priors: in vision +(classifier guidance), language (infilling under a discrete diffusion LLM), and +multimodal data (text-to-image generation). Beyond generative modeling, we +apply relative trajectory balance to the problem of continuous control with a +score-based behavior prior, achieving state-of-the-art results on benchmarks in +offline reinforcement learning. + +
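+ As a deliberately tiny picture of the posterior being targeted, the sketch
+below reweights samples from a prior p(x) by a black-box constraint r(x) using
+self-normalized importance sampling on a 1-D toy problem. This only illustrates
+the target distribution p(x)r(x)/Z, not the relative trajectory balance
+training objective; the Gaussian prior and constraint are arbitrary choices.
+<pre><code class="language-python">
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+# Toy prior p(x): a standard normal. Black-box constraint r(x): prefers x near 2.
+prior_samples = rng.normal(0.0, 1.0, size=100_000)
+r = np.exp(-0.5 * (prior_samples - 2.0) ** 2 / 0.25)  # unnormalized likelihood
+
+# Self-normalized importance weights give (approximate) samples from p(x)r(x)/Z.
+weights = r / r.sum()
+posterior_samples = rng.choice(prior_samples, size=10_000, p=weights)
+
+print("prior mean:    ", prior_samples.mean())
+print("posterior mean:", posterior_samples.mean())  # pulled toward 2 by r(x)
+</code></pre>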
+
+ comment: NeurIPS 2024; code: https://github.com/GFNOrg/diffusion-finetuning +
+
+
+
+
+ + ♻ ☆ InstructOCR: Instruction Boosting Scene Text Spotting + + +
+ In the field of scene text spotting, previous OCR methods primarily relied on +image encoders and pre-trained text information, but they often overlooked the +advantages of incorporating human language instructions. To address this gap, +we propose InstructOCR, an innovative instruction-based scene text spotting +model that leverages human language instructions to enhance the understanding +of text within images. Our framework employs both text and image encoders +during training and inference, along with instructions meticulously designed +based on text attributes. This approach enables the model to interpret text +more accurately and flexibly. Extensive experiments demonstrate the +effectiveness of our model and we achieve state-of-the-art results on widely +used benchmarks. Furthermore, the proposed framework can be seamlessly applied +to scene text VQA tasks. By leveraging instruction strategies during +pre-training, the performance on downstream VQA tasks can be significantly +improved, with a 2.6% increase on the TextVQA dataset and a 2.1% increase on +the ST-VQA dataset. These experimental results provide insights into the +benefits of incorporating human language instructions for OCR-related tasks. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ II-Bench: An Image Implication Understanding Benchmark for Multimodal + Large Language Models + + +
+ The rapid advancements in the development of multimodal large language models
+(MLLMs) have consistently led to new breakthroughs on various benchmarks. In
+response, numerous challenging and comprehensive benchmarks have been proposed
+to more accurately assess the capabilities of MLLMs. However, there is a dearth
+of exploration of the higher-order perceptual capabilities of MLLMs. To fill
+this gap, we propose the Image Implication understanding Benchmark, II-Bench,
+which aims to evaluate the model's higher-order perception of images. Through
+extensive experiments on II-Bench across multiple MLLMs, we have made
+significant findings. Initially, a substantial gap is observed between the
+performance of MLLMs and humans on II-Bench. The best accuracy achieved by
+MLLMs is 74.8%, whereas human accuracy averages 90% and peaks at an impressive
+98%. Subsequently, MLLMs perform worse on abstract and complex images,
+suggesting limitations in their ability to understand high-level semantics and
+capture image details. Finally, it is observed that most models exhibit
+enhanced accuracy when image sentiment polarity hints are incorporated into the
+prompts. This observation underscores a notable deficiency in their inherent
+understanding of image sentiment. We believe that II-Bench will inspire the
+community to develop the next generation of MLLMs, advancing the journey
+towards expert artificial general intelligence (AGI). II-Bench is publicly
+available at https://huggingface.co/datasets/m-a-p/II-Bench.
+
+
+
+ comment: 100 pages, 82 figures, add citations +
+
+
+
+
+ + ♻ ☆ EM-DARTS: Hierarchical Differentiable Architecture Search for Eye + Movement Recognition + + +
+ Eye movement biometrics has received increasing attention thanks to its
+highly secure identification. Although deep learning (DL) models have shown
+success in eye movement recognition, their architectures largely rely on human
+prior knowledge. Differentiable Neural Architecture Search (DARTS) automates
+the manual process of architecture design with high search efficiency. However,
+DARTS typically stacks multiple cells to form a convolutional network, which
+limits the diversity of architecture. Furthermore, DARTS generally searches for
+architectures using shallower networks than those used in the evaluation,
+creating a significant disparity in architecture depth between the search and
+evaluation phases. To address these issues, we propose EM-DARTS, a hierarchical
+differentiable architecture search algorithm to automatically design the DL
+architecture for eye movement recognition. First, we define a supernet and
+propose a global and local alternate Neural Architecture Search method to
+search the optimal architecture alternately with a differentiable neural
+architecture search. The local search strategy aims to find an optimal
+architecture for different cells while the global search strategy is
+responsible for optimizing the architecture of the target network. To minimize
+redundancy, transfer entropy is proposed to compute the information amount of
+each layer, thereby further simplifying the network search process.
+Experimental results on three public datasets demonstrate that the proposed
+EM-DARTS is capable of producing an optimal architecture that leads to
+state-of-the-art recognition performance. Specifically, the recognition models
+developed using EM-DARTS achieved the lowest EERs of 0.0453 on the GazeBase
+dataset, 0.0377 on the JuDo1000 dataset, and 0.1385 on the EMglasses dataset.
+
+
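+ The equal error rate (EER) reported above is the operating point at which the
+false reject rate equals the false accept rate. The sketch below estimates it
+from synthetic genuine and impostor similarity scores and is unrelated to the
+EM-DARTS search itself.
+<pre><code class="language-python">
+import numpy as np
+
+def equal_error_rate(genuine_scores, impostor_scores):
+    """EER: threshold where the false-reject and false-accept rates meet."""
+    thresholds = np.sort(np.concatenate([genuine_scores, impostor_scores]))
+    best_gap, eer = 1.0, 0.5
+    for t in thresholds:
+        frr = np.mean(genuine_scores < t)     # genuine samples rejected
+        far = np.mean(impostor_scores >= t)   # impostor samples accepted
+        gap = abs(frr - far)
+        if gap < best_gap:
+            best_gap, eer = gap, (frr + far) / 2.0
+    return eer
+
+rng = np.random.default_rng(0)
+genuine = rng.normal(1.0, 0.3, size=500)
+impostor = rng.normal(0.0, 0.3, size=500)
+print(equal_error_rate(genuine, impostor))
+</code></pre>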
+
+
 comment: Submitted to IEEE Transactions on Instrumentation and Measurement
+
+
+
+
+ + ♻ ☆ WeCromCL: Weakly Supervised Cross-Modality Contrastive Learning for + Transcription-only Supervised Text Spotting ECCV 2024 + + +
+ Transcription-only Supervised Text Spotting aims to learn text spotters +relying only on transcriptions but no text boundaries for supervision, thus +eliminating expensive boundary annotation. The crux of this task lies in +locating each transcription in scene text images without location annotations. +In this work, we formulate this challenging problem as a Weakly Supervised +Cross-modality Contrastive Learning problem, and design a simple yet effective +model dubbed WeCromCL that is able to detect each transcription in a scene +image in a weakly supervised manner. Unlike typical methods for cross-modality +contrastive learning that focus on modeling the holistic semantic correlation +between an entire image and a text description, our WeCromCL conducts atomistic +contrastive learning to model the character-wise appearance consistency between +a text transcription and its correlated region in a scene image to detect an +anchor point for the transcription in a weakly supervised manner. The detected +anchor points by WeCromCL are further used as pseudo location labels to guide +the learning of text spotting. Extensive experiments on four challenging +benchmarks demonstrate the superior performance of our model over other +methods. Code will be released. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ AI-Driven Diabetic Retinopathy Screening: Multicentric Validation of + AIDRSS in India + + +
+ Purpose: Diabetic retinopathy (DR) is a major cause of vision loss, +particularly in India, where access to retina specialists is limited in rural +areas. This study aims to evaluate the Artificial Intelligence-based Diabetic +Retinopathy Screening System (AIDRSS) for DR detection and prevalence +assessment, addressing the growing need for scalable, automated screening +solutions in resource-limited settings. + Approach: A multicentric, cross-sectional study was conducted in Kolkata, +India, involving 5,029 participants and 10,058 macula-centric retinal fundus +images. The AIDRSS employed a deep learning algorithm with 50 million trainable +parameters, integrated with Contrast Limited Adaptive Histogram Equalization +(CLAHE) preprocessing for enhanced image quality. DR was graded using the +International Clinical Diabetic Retinopathy (ICDR) Scale, categorizing disease +into five stages (DR0 to DR4). Statistical metrics including sensitivity, +specificity, and prevalence rates were evaluated against expert retina +specialist assessments. + Results: The prevalence of DR in the general population was 13.7%, rising to +38.2% among individuals with elevated random blood glucose levels. The AIDRSS +achieved an overall sensitivity of 92%, specificity of 88%, and 100% +sensitivity for detecting referable DR (DR3 and DR4). These results demonstrate +the system's robust performance in accurately identifying and grading DR in a +diverse population. + Conclusions: AIDRSS provides a reliable, scalable solution for early DR +detection in resource-constrained environments. Its integration of advanced AI +techniques ensures high diagnostic accuracy, with potential to significantly +reduce the burden of diabetes-related vision loss in underserved regions. + +
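+ As a side note on the metrics reported above, sensitivity, specificity, and
+ prevalence follow directly from a confusion matrix. The sketch below (Python) uses
+ hypothetical counts, not data from the AIDRSS study, purely to illustrate the
+ definitions.
+
+def screening_metrics(tp, fp, tn, fn):
+    # Standard definitions; counts are per-patient decisions for referable DR.
+    sensitivity = tp / (tp + fn)            # true positive rate
+    specificity = tn / (tn + fp)            # true negative rate
+    prevalence = (tp + fn) / (tp + fp + tn + fn)
+    return sensitivity, specificity, prevalence
+
+# Hypothetical example counts (not the study's data):
+sens, spec, prev = screening_metrics(tp=130, fp=110, tn=820, fn=10)
+print(f"sensitivity={sens:.2f}, specificity={spec:.2f}, prevalence={prev:.2f}")
+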
+
+ comment: 22 pages, 5 figures. arXiv admin note: substantial text overlap with + arXiv:1812.07105 by other authors without attribution +
+
+
+
+
+ + ♻ ☆ HeadGAP: Few-Shot 3D Head Avatar via Generalizable Gaussian Priors 3DV 2025 + + +
+ In this paper, we present a novel 3D head avatar creation approach capable of +generalizing from few-shot in-the-wild data with high-fidelity and animatable +robustness. Given the underconstrained nature of this problem, incorporating +prior knowledge is essential. Therefore, we propose a framework comprising +prior learning and avatar creation phases. The prior learning phase leverages +3D head priors derived from a large-scale multi-view dynamic dataset, and the +avatar creation phase applies these priors for few-shot personalization. Our +approach effectively captures these priors by utilizing a Gaussian +Splatting-based auto-decoder network with part-based dynamic modeling. Our +method employs identity-shared encoding with personalized latent codes for +individual identities to learn the attributes of Gaussian primitives. During +the avatar creation phase, we achieve fast head avatar personalization by +leveraging inversion and fine-tuning strategies. Extensive experiments +demonstrate that our model effectively exploits head priors and successfully +generalizes them to few-shot personalization, achieving photo-realistic +rendering quality, multi-view consistency, and stable animation. + +
+
+ comment: Accepted to 3DV 2025. Project page: https://headgap.github.io/ +
+
+
+
+
+ + ♻ ☆ Quilt-LLaVA: Visual Instruction Tuning by Extracting Localized + Narratives from Open-Source Histopathology Videos + + +
+ Diagnosis in histopathology requires a global analysis of whole slide images
+(WSIs), requiring pathologists to compound evidence from different WSI
+patches. The gigapixel scale of WSIs poses a challenge for histopathology
+multi-modal models. Training multi-modal models for histopathology requires
+instruction tuning datasets, which currently contain information for individual
+image patches, without a spatial grounding of the concepts within each patch
+and without a wider view of the WSI. Therefore, they lack sufficient diagnostic
+capacity for histopathology. To bridge this gap, we introduce Quilt-Instruct, a
+large-scale dataset of 107,131 histopathology-specific instruction
+question/answer pairs, grounded within diagnostically relevant image patches
+that make up the WSI. Our dataset is collected by leveraging educational
+histopathology videos from YouTube, which provides spatial localization of
+narrations by automatically extracting the narrators' cursor positions.
+Quilt-Instruct supports contextual reasoning by extracting diagnosis and
+supporting facts from the entire WSI. Using Quilt-Instruct, we train
+Quilt-LLaVA, which can reason beyond the given single image patch, enabling
+diagnostic reasoning across patches. To evaluate Quilt-LLaVA, we propose a
+comprehensive evaluation dataset created from 985 images and 1283
+human-generated question-answers. We also thoroughly evaluate Quilt-LLaVA using
+public histopathology datasets, where Quilt-LLaVA significantly outperforms
+SOTA by over 10% on relative GPT-4 score and 4% and 9% on open and closed set
+VQA. Our code, data, and model are publicly accessible at
+quilt-llava.github.io.
+
+
+
+
+
+ + ♻ ☆ Simplifying CLIP: Unleashing the Power of Large-Scale Models on + Consumer-level Computers + + +
+ Contrastive Language-Image Pre-training (CLIP) has attracted a surge of +attention for its superior zero-shot performance and excellent transferability +to downstream tasks. However, training such large-scale models usually requires +substantial computation and storage, which poses barriers for general users +with consumer-level computers. Motivated by this observation, in this paper we +investigate how to achieve competitive performance on only one Nvidia RTX3090 +GPU and with one terabyte for storing dataset. On one hand, we simplify the +transformer block structure and combine Weight Inheritance with multi-stage +Knowledge Distillation (WIKD), thereby reducing the parameters and improving +the inference speed during training along with deployment. On the other hand, +confronted with the convergence challenge posed by small dataset, we generate +synthetic captions for each sample as data augmentation, and devise a novel +Pair Matching (PM) loss to fully exploit the distinguishment among positive and +negative image-text pairs. Extensive experiments demonstrate that our model can +achieve a new state-of-the-art datascale-parameter-accuracy tradeoff, which +could further popularize the CLIP model in the related research community. + +
+
+
+
+
+ + ♻ ☆ Buster: Implanting Semantic Backdoor into Text Encoder to Mitigate NSFW + Content Generation + + +
+ The rise of deep learning models in the digital era has raised substantial
+concerns regarding the generation of Not-Safe-for-Work (NSFW) content. Existing
+defense methods primarily involve model fine-tuning and post-hoc content
+moderation. Nevertheless, these approaches largely lack scalability in
+eliminating harmful content, degrade the quality of benign image generation, or
+incur high inference costs. To address these challenges, we propose an
+innovative framework named \textit{Buster}, which injects backdoors into the
+text encoder to prevent NSFW content generation. Buster leverages deep semantic
+information rather than explicit prompts as triggers, redirecting NSFW prompts
+towards targeted benign prompts. Additionally, Buster employs energy-based
+training data generation through Langevin dynamics for adversarial knowledge
+augmentation, thereby ensuring robustness in harmful concept definition. This
+approach demonstrates exceptional resilience and scalability in mitigating NSFW
+content. Particularly, Buster fine-tunes the text encoder of Text-to-Image
+models within merely five minutes, showcasing its efficiency. Our extensive
+experiments demonstrate that Buster outperforms nine state-of-the-art baselines,
+achieving a superior NSFW content removal rate of at least 91.2\% while
+preserving the quality of harmless images.
+
+
+
+
+
+ + ♻ ☆ On the Robustness of Object Detection Models on Aerial Images + + +
+ The robustness of object detection models is a major concern when applied to +real-world scenarios. The performance of most models tends to degrade when +confronted with images affected by corruptions, since they are usually trained +and evaluated on clean datasets. While numerous studies have explored the +robustness of object detection models on natural images, there is a paucity of +research focused on models applied to aerial images, which feature complex +backgrounds, substantial variations in scales, and orientations of objects. +This paper addresses the challenge of assessing the robustness of object +detection models on aerial images, with a specific emphasis on scenarios where +images are affected by clouds. In this study, we introduce two novel benchmarks +based on DOTA-v1.0. The first benchmark encompasses 19 prevalent corruptions, +while the second focuses on the cloud-corrupted condition-a phenomenon uncommon +in natural images yet frequent in aerial photography. We systematically +evaluate the robustness of mainstream object detection models and perform +necessary ablation experiments. Through our investigations, we find that +rotation-invariant modeling and enhanced backbone architectures can improve the +robustness of models. Furthermore, increasing the capacity of Transformer-based +backbones can strengthen their robustness. The benchmarks we propose and our +comprehensive experimental analyses can facilitate research on robust object +detection on aerial images. The codes and datasets are available at: +https://github.com/hehaodong530/DOTA-C. + +
+
+ comment: accepted by IEEE TGRS +
+
+
+
+
+ + ♻ ☆ Pamba: Enhancing Global Interaction in Point Clouds via State Space + Model + + +
+ Transformers have demonstrated impressive results for 3D point cloud semantic +segmentation. However, the quadratic complexity of transformer makes +computation costs high, limiting the number of points that can be processed +simultaneously and impeding the modeling of long-range dependencies between +objects in a single scene. Drawing inspiration from the great potential of +recent state space models (SSM) for long sequence modeling, we introduce Mamba, +an SSM-based architecture, to the point cloud domain and propose Pamba, a novel +architecture with strong global modeling capability under linear complexity. +Specifically, to make the disorderness of point clouds fit in with the causal +nature of Mamba, we propose a multi-path serialization strategy applicable to +point clouds. Besides, we propose the ConvMamba block to compensate for the +shortcomings of Mamba in modeling local geometries and in unidirectional +modeling. Pamba obtains state-of-the-art results on several 3D point cloud +segmentation tasks, including ScanNet v2, ScanNet200, S3DIS and nuScenes, while +its effectiveness is validated by extensive experiments. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ MovieCharacter: A Tuning-Free Framework for Controllable Character Video + Synthesis + + +
+ Recent advancements in character video synthesis still depend on extensive +fine-tuning or complex 3D modeling processes, which can restrict accessibility +and hinder real-time applicability. To address these challenges, we propose a +simple yet effective tuning-free framework for character video synthesis, named +MovieCharacter, designed to streamline the synthesis process while ensuring +high-quality outcomes. Our framework decomposes the synthesis task into +distinct, manageable modules: character segmentation and tracking, video object +removal, character motion imitation, and video composition. This modular design +not only facilitates flexible customization but also ensures that each +component operates collaboratively to effectively meet user needs. By +leveraging existing open-source models and integrating well-established +techniques, MovieCharacter achieves impressive synthesis results without +necessitating substantial resources or proprietary datasets. Experimental +results demonstrate that our framework enhances the efficiency, accessibility, +and adaptability of character video synthesis, paving the way for broader +creative and interactive applications. + +
+
+
+
+
+ + ♻ ☆ MLLM-CompBench: A Comparative Reasoning Benchmark for Multimodal LLMs NeurIPS 2024 + + +
+ The ability to compare objects, scenes, or situations is crucial for +effective decision-making and problem-solving in everyday life. For instance, +comparing the freshness of apples enables better choices during grocery +shopping while comparing sofa designs helps optimize the aesthetics of our +living space. Despite its significance, the comparative capability is largely +unexplored in artificial general intelligence (AGI). In this paper, we +introduce MLLM-CompBench, a benchmark designed to evaluate the comparative +reasoning capability of multimodal large language models (MLLMs). +MLLM-CompBench mines and pairs images through visually oriented questions +covering eight dimensions of relative comparison: visual attribute, existence, +state, emotion, temporality, spatiality, quantity, and quality. We curate a +collection of around 40K image pairs using metadata from diverse vision +datasets and CLIP similarity scores. These image pairs span a broad array of +visual domains, including animals, fashion, sports, and both outdoor and indoor +scenes. The questions are carefully crafted to discern relative characteristics +between two images and are labeled by human annotators for accuracy and +relevance. We use MLLM-CompBench to evaluate recent MLLMs, including +GPT-4V(ision), Gemini-Pro, and LLaVA-1.6. Our results reveal notable +shortcomings in their comparative abilities. We believe MLLM-COMPBENCH not only +sheds light on these limitations but also establishes a solid foundation for +future enhancements in the comparative capability of MLLMs. + +
+
+ comment: This paper has been accepted to NeurIPS 2024. The first two authors + contributed equally to this work +
+
+
+
+
+ + ♻ ☆ SL-YOLO: A Stronger and Lighter Drone Target Detection Model + + +
+ Detecting small objects in complex scenes, such as those captured by drones, +is a daunting challenge due to the difficulty in capturing the complex features +of small targets. While the YOLO family has achieved great success in large +target detection, its performance is less than satisfactory when faced with +small targets. Because of this, this paper proposes a revolutionary model +SL-YOLO (Stronger and Lighter YOLO) that aims to break the bottleneck of small +target detection. We propose the Hierarchical Extended Path Aggregation Network +(HEPAN), a pioneering cross-scale feature fusion method that can ensure +unparalleled detection accuracy even in the most challenging environments. At +the same time, without sacrificing detection capabilities, we design the C2fDCB +lightweight module and add the SCDown downsampling module to greatly reduce the +model's parameters and computational complexity. Our experimental results on +the VisDrone2019 dataset reveal a significant improvement in performance, with +mAP@0.5 jumping from 43.0% to 46.9% and mAP@0.5:0.95 increasing from 26.0% to +28.9%. At the same time, the model parameters are reduced from 11.1M to 9.6M, +and the FPS can reach 132, making it an ideal solution for real-time small +object detection in resource-constrained environments. + +
+
+
+
+
+ + ♻ ☆ SoftPatch+: Fully Unsupervised Anomaly Classification and Segmentation + + +
+ Although mainstream unsupervised anomaly detection (AD) (including
+image-level classification and pixel-level segmentation) algorithms perform well
+on academic datasets, their performance is limited in practical application due
+to the ideal experimental setting of clean training data. Training with noisy
+data is an inevitable problem in real-world anomaly detection but is seldom
+discussed. This paper is the first to consider fully unsupervised industrial
+anomaly detection (i.e., unsupervised AD with noisy data). To solve this
+problem, we propose memory-based unsupervised AD methods, SoftPatch and
+SoftPatch+, which efficiently denoise the data at the patch level. Noise
+discriminators are utilized to generate outlier scores for patch-level noise
+elimination before coreset construction. The scores are then stored in the
+memory bank to soften the anomaly detection boundary. Compared with existing
+methods, SoftPatch maintains a strong modeling ability of normal data and
+alleviates the overconfidence problem in coreset, and SoftPatch+ has more
+robust performance which is particularly useful in real-world industrial
+inspection scenarios with high levels of noise (from 10% to 40%). Comprehensive
+experiments conducted in diverse noise scenarios demonstrate that both
+SoftPatch and SoftPatch+ outperform the state-of-the-art AD methods on the
+MVTecAD, ViSA, and BTAD benchmarks. Furthermore, the performance of SoftPatch
+and SoftPatch+ is comparable to that of the noise-free methods in the
+conventional unsupervised AD setting. The code of the proposed methods can be
+found at https://github.com/TencentYoutuResearch/AnomalyDetection-SoftPatch.
+
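+ A minimal sketch of the patch-level denoising idea described above, assuming patch
+ features have already been extracted: a kNN-distance outlier score stands in for the
+ paper's noise discriminators, the noisiest patches are dropped before the memory bank
+ (coreset) is built, and the remaining scores are kept as soft weights. This is
+ illustrative Python, not the released SoftPatch code.
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def soft_memory_bank(patch_feats, k=5, keep_ratio=0.9):
+    # Outlier score: mean distance to the k nearest other patches.
+    nn = NearestNeighbors(n_neighbors=k + 1).fit(patch_feats)
+    dists, _ = nn.kneighbors(patch_feats)        # column 0 is the patch itself
+    outlier_score = dists[:, 1:].mean(axis=1)
+    # Drop the noisiest patches before building the coreset / memory bank.
+    threshold = np.quantile(outlier_score, keep_ratio)
+    keep = outlier_score <= threshold
+    kept_feats = patch_feats[keep]
+    # Remaining scores act as soft weights that can soften the anomaly boundary.
+    weights = 1.0 / (1.0 + outlier_score[keep])
+    return kept_feats, weights
+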
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2403.14233 + paper has been accepted by Pattern Recognition +
+
+
+
+
+ + ♻ ☆ MedicalNarratives: Connecting Medical Vision and Language with Localized + Narratives + + +
+ We propose MedicalNarratives, a dataset curated from medical pedagogical +videos similar in nature to data collected in Think-Aloud studies and inspired +by Localized Narratives, which collects grounded image-text data by curating +instructors' speech and mouse cursor movements synchronized in time. +MedicalNarratives enables pretraining of both semantic and dense objectives, +alleviating the need to train medical semantic and dense tasks disparately due +to the lack of reasonably sized datasets. Our dataset contains 4.7M image-text +pairs from videos and articles, with 1M samples containing dense annotations in +the form of traces and bounding boxes. To evaluate the utility of +MedicalNarratives, we train GenMedClip based on the CLIP architecture using our +dataset spanning 12 medical domains and demonstrate that it outperforms +previous state-of-the-art models on a newly constructed medical imaging +benchmark that comprehensively evaluates performance across all modalities. +Data, demo, code and models available at https://medical-narratives.github.io + +
+
+
+
+
+ + ♻ ☆ Images are Achilles' Heel of Alignment: Exploiting Visual + Vulnerabilities for Jailbreaking Multimodal Large Language Models ECCV 2024 + + +
+ In this paper, we study the harmlessness alignment problem of multimodal +large language models (MLLMs). We conduct a systematic empirical analysis of +the harmlessness performance of representative MLLMs and reveal that the image +input poses the alignment vulnerability of MLLMs. Inspired by this, we propose +a novel jailbreak method named HADES, which hides and amplifies the harmfulness +of the malicious intent within the text input, using meticulously crafted +images. Experimental results show that HADES can effectively jailbreak existing +MLLMs, which achieves an average Attack Success Rate (ASR) of 90.26% for +LLaVA-1.5 and 71.60% for Gemini Pro Vision. Our code and data are available at +https://github.com/RUCAIBox/HADES. + +
+
+ comment: ECCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ The Streetscape Application Services Stack (SASS): Towards a Distributed + Sensing Architecture for Urban Applications + + +
+ As urban populations grow, cities are becoming more complex, driving the
+deployment of interconnected sensing systems to realize the vision of smart
+cities. These systems aim to improve safety, mobility, and quality of life
+through applications that integrate diverse sensors with real-time
+decision-making. Streetscape applications, which focus on challenges like
+pedestrian safety and adaptive traffic management, depend on managing
+distributed, heterogeneous sensor data, aligning information across time and
+space, and enabling real-time processing. These tasks are inherently complex
+and often difficult to scale. The Streetscape Application Services Stack (SASS)
+addresses these challenges with three core services: multimodal data
+synchronization, spatiotemporal data fusion, and distributed edge computing. By
+structuring these capabilities as composable abstractions with clear
+semantics, SASS allows developers to scale streetscape applications efficiently
+while minimizing the complexity of multimodal integration.
+ We evaluated SASS in two real-world testbed environments: a controlled
+parking lot and an urban intersection in a major U.S. city. These testbeds
+allowed us to test SASS under diverse conditions, demonstrating its practical
+applicability. The Multimodal Data Synchronization service reduced temporal
+misalignment errors by 88%, achieving synchronization accuracy within 50
+milliseconds. The Spatiotemporal Data Fusion service improved detection accuracy
+for pedestrians and vehicles by over 10%, leveraging multicamera integration.
+The Distributed Edge Computing service increased system throughput by more than
+an order of magnitude. Together, these results show how SASS provides the
+abstractions and performance needed to support real-time, scalable urban
+applications, bridging the gap between sensing infrastructure and actionable
+streetscape intelligence.
+
+
+
+
+
+ + ♻ ☆ Valley2: Exploring Multimodal Models with Scalable Vision-Language + Design + + +
+ Recently, vision-language models have made remarkable progress, demonstrating +outstanding capabilities in various tasks such as image captioning and video +understanding. We introduce Valley2, a novel multimodal large language model +designed to enhance performance across all domains and extend the boundaries of +practical applications in e-commerce and short video scenarios. Notably, +Valley2 achieves state-of-the-art (SOTA) performance on e-commerce benchmarks, +surpassing open-source models of similar size by a large margin (79.66 vs. +72.76). Additionally, Valley2 ranks second on the OpenCompass leaderboard among +models with fewer than 10B parameters, with an impressive average score of +67.4. The code and model weights are open-sourced at +https://github.com/bytedance/Valley. + +
+
+
+
+
+ + ♻ ☆ ChartX & ChartVLM: A Versatile Benchmark and Foundation Model for + Complicated Chart Reasoning + + +
+ Recently, many versatile Multi-modal Large Language Models (MLLMs) have +emerged continuously. However, their capacity to query information depicted in +visual charts and engage in reasoning based on the queried contents remains +under-explored. In this paper, to comprehensively and rigorously benchmark the +ability of the off-the-shelf MLLMs in the chart domain, we construct ChartX, a +multi-modal evaluation set covering 18 chart types, 7 chart tasks, 22 +disciplinary topics, and high-quality chart data. Besides, we develop ChartVLM +to offer a new perspective on handling multi-modal tasks that strongly depend +on interpretable patterns, such as reasoning tasks in the field of charts or +geometric images. We evaluate the chart-related ability of mainstream MLLMs and +our ChartVLM on the proposed ChartX evaluation set. Extensive experiments +demonstrate that ChartVLM surpasses both versatile and chart-related large +models, achieving results comparable to GPT-4V. We believe that our study can +pave the way for further exploration in creating a more comprehensive chart +evaluation set and developing more interpretable multi-modal models. Both +ChartX and ChartVLM are available at: +https://github.com/Alpha-Innovator/ChartVLM + +
+
+ comment: Code and dataset are available for downloading at: + https://github.com/Alpha-Innovator/ChartVLM 25 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ LDMapNet-U: An End-to-End System for City-Scale Lane-Level Map Updating + + +
+ An up-to-date city-scale lane-level map is an indispensable infrastructure +and a key enabling technology for ensuring the safety and user experience of +autonomous driving systems. In industrial scenarios, reliance on manual +annotation for map updates creates a critical bottleneck. Lane-level updates +require precise change information and must ensure consistency with adjacent +data while adhering to strict standards. Traditional methods utilize a +three-stage approach-construction, change detection, and updating-which often +necessitates manual verification due to accuracy limitations. This results in +labor-intensive processes and hampers timely updates. To address these +challenges, we propose LDMapNet-U, which implements a new end-to-end paradigm +for city-scale lane-level map updating. By reconceptualizing the update task as +an end-to-end map generation process grounded in historical map data, we +introduce a paradigm shift in map updating that simultaneously generates +vectorized maps and change information. To achieve this, a Prior-Map Encoding +(PME) module is introduced to effectively encode historical maps, serving as a +critical reference for detecting changes. Additionally, we incorporate a novel +Instance Change Prediction (ICP) module that learns to predict associations +with historical maps. Consequently, LDMapNet-U simultaneously achieves +vectorized map element generation and change detection. To demonstrate the +superiority and effectiveness of LDMapNet-U, extensive experiments are +conducted using large-scale real-world datasets. In addition, LDMapNet-U has +been successfully deployed in production at Baidu Maps since April 2024, +supporting map updating for over 360 cities and significantly shortening the +update cycle from quarterly to weekly. The updated maps serve hundreds of +millions of users and are integrated into the autonomous driving systems of +several leading vehicle companies. + +
+
+ comment: Accepted by KDD 2025, camera-ready version +
+
+
+
+
+ + ♻ ☆ PViT: Prior-augmented Vision Transformer for Out-of-distribution + Detection + + +
+ Vision Transformers (ViTs) have achieved remarkable success over various +vision tasks, yet their robustness against data distribution shifts and +inherent inductive biases remain underexplored. To enhance the robustness of +ViT models for image Out-of-Distribution (OOD) detection, we introduce a novel +and generic framework named Prior-augmented Vision Transformer (PViT). Taking +as input the prior class logits from a pretrained model, we train PViT to +predict the class logits. During inference, PViT identifies OOD samples by +quantifying the divergence between the predicted class logits and the prior +logits obtained from pre-trained models. Unlike existing state-of-the-art(SOTA) +OOD detection methods, PViT shapes the decision boundary between ID and OOD by +utilizing the proposed prior guided confidence, without requiring additional +data modeling, generation methods, or structural modifications. Extensive +experiments on the large-scale ImageNet benchmark, evaluated against over seven +OOD datasets, demonstrate that PViT significantly outperforms existing SOTA OOD +detection methods in terms of FPR95 and AUROC. The codebase is publicly +available at https://github.com/RanchoGoose/PViT. + +
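+ The scoring rule described above can be sketched as follows (Python/PyTorch):
+ OOD-ness is measured as the divergence between the class distribution predicted by
+ the prior-augmented model and the distribution implied by the prior logits. The
+ function and variable names are illustrative assumptions, not the released PViT
+ implementation.
+
+import torch
+import torch.nn.functional as F
+
+def ood_score(pred_logits, prior_logits):
+    # KL divergence between the prior model's distribution and the
+    # prior-augmented model's prediction; larger divergence -> more likely OOD.
+    log_p = F.log_softmax(pred_logits, dim=-1)
+    q = F.softmax(prior_logits, dim=-1)
+    return (q * (q.clamp_min(1e-12).log() - log_p)).sum(dim=-1)
+
+# Usage (names hypothetical): flag samples whose score exceeds a threshold tau
+# chosen on in-distribution validation data.
+# scores = ood_score(pvit_model(prior_logits), prior_logits)
+# is_ood = scores > tau
+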
+
+
+
+
+ + ♻ ☆ Sparse Attention Vectors: Generative Multimodal Model Features Are + Discriminative Vision-Language Classifiers + + +
+ Generative Large Multimodal Models (LMMs) like LLaVA and Qwen-VL excel at a +wide variety of vision-language (VL) tasks such as image captioning or visual +question answering. Despite strong performance, LMMs are not directly suited +for foundational discriminative vision-language tasks (i.e., tasks requiring +discrete label predictions) such as image classification and multiple-choice +VQA. One key challenge in utilizing LMMs for discriminative tasks is the +extraction of useful features from generative models. To overcome this issue, +we propose an approach for finding features in the model's latent space to more +effectively leverage LMMs for discriminative tasks. Toward this end, we present +Sparse Attention Vectors (SAVs) -- a finetuning-free method that leverages +sparse attention head activations (fewer than 1\% of the heads) in LMMs as +strong features for VL tasks. With only few-shot examples, SAVs demonstrate +state-of-the-art performance compared to a variety of few-shot and finetuned +baselines on a collection of discriminative tasks. Our experiments also imply +that SAVs can scale in performance with additional examples and generalize to +similar tasks, establishing SAVs as both effective and robust multimodal +feature representations. + +
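+ A rough sketch of the idea, assuming attention-head activations have already been
+ extracted per example: score each head by how well it separates the few-shot classes,
+ keep roughly 1% of heads, and classify queries by nearest class mean over the kept
+ activations. The head-scoring criterion here is a generic Fisher-style ratio chosen
+ for illustration and is not necessarily the authors' selection rule.
+
+import numpy as np
+
+def select_sparse_heads(head_acts, labels, top_frac=0.01):
+    # head_acts: (n_examples, n_heads, dim) activations; labels: (n_examples,)
+    n, h, d = head_acts.shape
+    classes = np.unique(labels)
+    scores = np.zeros(h)
+    for j in range(h):
+        feats = head_acts[:, j, :]
+        class_means = np.stack([feats[labels == c].mean(axis=0) for c in classes])
+        # Between-class spread relative to overall spread (Fisher-style score).
+        scores[j] = class_means.var(axis=0).sum() / (feats.var(axis=0).sum() + 1e-8)
+    n_keep = max(1, int(top_frac * h))
+    return np.argsort(scores)[-n_keep:]
+
+def classify_with_savs(query_acts, support_acts, support_labels, heads):
+    # Nearest class mean over the concatenated activations of the kept heads.
+    flat = lambda a: a[:, heads, :].reshape(len(a), -1)
+    classes = np.unique(support_labels)
+    protos = np.stack([flat(support_acts[support_labels == c]).mean(axis=0)
+                       for c in classes])
+    dists = np.linalg.norm(flat(query_acts)[:, None, :] - protos[None], axis=-1)
+    return classes[dists.argmin(axis=1)]
+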
+
+
+
+
+ + ♻ ☆ Pre-trained Vision-Language Models Learn Discoverable Visual Concepts + + +
+ Do vision-language models (VLMs) pre-trained to caption an image of a +"durian" learn visual concepts such as "brown" (color) and "spiky" (texture) at +the same time? We aim to answer this question as visual concepts learned "for +free" would enable wide applications such as neuro-symbolic reasoning or +human-interpretable object classification. We assume that the visual concepts, +if captured by pre-trained VLMs, can be extracted by their vision-language +interface with text-based concept prompts. We observe that recent works +prompting VLMs with concepts often differ in their strategies to define and +evaluate the visual concepts, leading to conflicting conclusions. We propose a +new concept definition strategy based on two observations: First, certain +concept prompts include shortcuts that recognize correct concepts for wrong +reasons; Second, multimodal information (e.g. visual discriminativeness, and +textual knowledge) should be leveraged when selecting the concepts. Our +proposed concept discovery and learning (CDL) framework is thus designed to +identify a diverse list of generic visual concepts (e.g. "spiky" as opposed to +"spiky durian"), which are ranked and selected based on visual and language +mutual information. We carefully design quantitative and human evaluations of +the discovered concepts on six diverse visual recognition datasets, which +confirm that pre-trained VLMs do learn visual concepts that provide accurate +and thorough descriptions for the recognized objects. All code and models are +publicly released. + +
+
+ comment: Transactions on Machine Learning Research, 2025 +
+
+
+
+
+ + ♻ ☆ Extracting Manifold Information from Point Clouds + + +
+ A kernel based method is proposed for the construction of signature +(defining) functions of subsets of $\mathbb{R}^d$. The subsets can range from +full dimensional manifolds (open subsets) to point clouds (a finite number of +points) and include bounded smooth manifolds of any codimension. The +interpolation and analysis of point clouds are the main application. Two +extreme cases in terms of regularity are considered, where the data set is +interpolated by an analytic surface, at the one extreme, and by a H\"older +continuous surface, at the other. The signature function can be computed as a +linear combination of translated kernels, the coefficients of which are the +solution of a finite dimensional linear problem. Once it is obtained, it can be +used to estimate the dimension as well as the normal and the curvatures of the +interpolated surface. The method is global and does not require explicit +knowledge of local neighborhoods or any other structure present in the data +set. It admits a variational formulation with a natural ``regularized'' +counterpart, that proves to be useful in dealing with data sets corrupted by +numerical error or noise. The underlying analytical structure of the approach +is presented in general before it is applied to the case of point clouds. + +
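+ The construction above reduces to solving a finite-dimensional linear system for the
+ kernel coefficients. The sketch below (Python/NumPy) uses a Gaussian kernel and a
+ small regularization term as illustrative choices, and adds off-surface points with
+ signed values so the interpolant is not identically zero; it is not the paper's
+ implementation.
+
+import numpy as np
+
+def fit_signature(points, values, sigma=0.5, reg=1e-8):
+    # Signature function s(x) = sum_i c_i * exp(-|x - p_i|^2 / (2 sigma^2)),
+    # with coefficients c solving the (regularized) linear system K c = values.
+    d2 = ((points[:, None, :] - points[None, :, :]) ** 2).sum(-1)
+    K = np.exp(-d2 / (2.0 * sigma ** 2))
+    coeffs = np.linalg.solve(K + reg * np.eye(len(points)), values)
+
+    def signature(x):
+        dq = ((x[:, None, :] - points[None, :, :]) ** 2).sum(-1)
+        return np.exp(-dq / (2.0 * sigma ** 2)) @ coeffs
+    return signature
+
+# Toy example: a noisy circle, with off-surface points added along the radial
+# direction so the zero level set of the interpolant traces the curve.
+theta = np.linspace(0.0, 2.0 * np.pi, 60, endpoint=False)
+on_surf = np.c_[np.cos(theta), np.sin(theta)] + 0.01 * np.random.randn(60, 2)
+pts = np.vstack([on_surf, 1.2 * on_surf, 0.8 * on_surf])
+vals = np.r_[np.zeros(60), np.full(60, 0.2), np.full(60, -0.2)]
+s = fit_signature(pts, vals)
+print(s(np.array([[1.0, 0.0]])))  # approximately zero on the circle
+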
+
+ comment: 27 pages, 16 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ ExACT: Teaching AI Agents to Explore with Reflective-MCTS and + Exploratory Learning + + +
+ Autonomous agents have demonstrated significant potential in automating +complex multistep decision-making tasks. However, even state-of-the-art +vision-language models (VLMs), such as GPT-4o, still fall short of human-level +performance, particularly in intricate web environments and long-horizon tasks. +To address these limitations, we present ExACT, an approach to combine +test-time search and self-learning to build o1-like models for agentic +applications. We first introduce Reflective Monte Carlo Tree Search (R-MCTS), a +novel test time algorithm designed to enhance AI agents' ability to explore +decision space on the fly. R-MCTS extends traditional MCTS by 1) incorporating +contrastive reflection, allowing agents to learn from past interactions and +dynamically improve their search efficiency; and 2) using multi-agent debate +for reliable state evaluation. Next, we introduce Exploratory Learning, a novel +learning strategy to teach agents to search at inference time without relying +on any external search algorithms. On the challenging VisualWebArena benchmark, +our GPT-4o based R-MCTS agent achieves a 6% to 30% relative improvement across +various tasks compared to the previous state-of-the-art. Additionally, we show +that the knowledge and experience gained from test-time search can be +effectively transferred back to GPT-4o via fine-tuning. After Exploratory +Learning, GPT-4o 1) demonstrates the ability to explore the environment, +evaluate a state, and backtrack to viable ones when it detects that the current +state cannot lead to success, and 2) matches 87% of R-MCTS's performance while +using significantly less compute. Notably, our work demonstrates the compute +scaling properties in both training - data collection with R-MCTS - and testing +time. These results suggest a promising research direction to enhance VLMs' +capabilities for agentic applications via test-time search and self-learning. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 17 + +
+
+
+ + ☆ Learning Implicit Social Navigation Behavior using Deep Inverse + Reinforcement Learning + + +
+ This paper reports on learning a reward map for social navigation in dynamic +environments where the robot can reason about its path at any time, given +agents' trajectories and scene geometry. Humans navigating in dense and dynamic +indoor environments often work with several implied social rules. A rule-based +approach fails to model all possible interactions between humans, robots, and +scenes. We propose a novel Smooth Maximum Entropy Deep Inverse Reinforcement +Learning (S-MEDIRL) algorithm that can extrapolate beyond expert demos to +better encode scene navigability from few-shot demonstrations. The agent learns +to predict the cost maps reasoning on trajectory data and scene geometry. The +agent samples a trajectory that is then executed using a local crowd navigation +controller. We present results in a photo-realistic simulation environment, +with a robot and a human navigating a narrow crossing scenario. The robot +implicitly learns to exhibit social behaviors such as yielding to oncoming +traffic and avoiding deadlocks. We compare the proposed approach to the popular +model-based crowd navigation algorithm ORCA and a rule-based agent that +exhibits yielding. + +
+
+ comment: 8 pages, Submitted to IEEE Robotics and Automation Letters (RAL) +
+
+
+
+
+ + ☆ Shake-VLA: Vision-Language-Action Model-Based System for Bimanual + Robotic Manipulations and Liquid Mixing + + +
+ This paper introduces Shake-VLA, a Vision-Language-Action (VLA) model-based
+system designed to enable bimanual robotic manipulation for automated cocktail
+preparation. The system integrates a vision module for detecting ingredient
+bottles and reading labels, a speech-to-text module for interpreting user
+commands, and a language model to generate task-specific robotic instructions.
+Force Torque (FT) sensors are employed to precisely measure the quantity of
+liquid poured, ensuring accuracy in ingredient proportions during the mixing
+process. The system architecture includes a Retrieval-Augmented Generation
+(RAG) module for accessing and adapting recipes, an anomaly detection mechanism
+to address ingredient availability issues, and bimanual robotic arms for
+dexterous manipulation. Experimental evaluations demonstrated a high success
+rate across system components, with the speech-to-text module achieving a 93%
+success rate in noisy environments, the vision module attaining a 91% success
+rate in object and label detection in a cluttered environment, the anomaly module
+successfully identifying 95% of discrepancies between detected ingredients and
+recipe requirements, and the system achieving an overall success rate of 100% in
+preparing cocktails, from recipe formulation to action generation.
+
+
+ comment: Accepted to IEEE/ACM HRI 2025 +
+
+
+
+
+ + ☆ From Simulation to Field: Learning Terrain Traversability for Real-World + Deployment + + +
+ The challenge of traversability estimation is a crucial aspect of autonomous +navigation in unstructured outdoor environments such as forests. It involves +determining whether certain areas are passable or risky for robots, taking into +account factors like terrain irregularities, slopes, and potential obstacles. +The majority of current methods for traversability estimation operate on the +assumption of an offline computation, overlooking the significant influence of +the robot's heading direction on accurate traversability estimates. In this +work, we introduce a deep neural network that uses detailed geometric +environmental data together with the robot's recent movement characteristics. +This fusion enables the generation of robot direction awareness and continuous +traversability estimates, essential for enhancing robot autonomy in challenging +terrains like dense forests. The efficacy and significance of our approach are +underscored by experiments conducted on both simulated and real robotic +platforms in various environments, yielding quantitatively superior performance +results compared to existing methods. Moreover, we demonstrate that our method, +trained exclusively in a high-fidelity simulated setting, can accurately +predict traversability in real-world applications without any real data +collection. Our experiments showcase the advantages of our method for +optimizing path-planning and exploration tasks within difficult outdoor +environments, underscoring its practicality for effective, real-world robotic +navigation. In the spirit of collaborative advancement, we have made the code +implementation available to the public. + +
+
+ comment: 38 pages +
+
+
+
+
+ + ☆ ActiveGAMER: Active GAussian Mapping through Efficient Rendering + + +
+ We introduce ActiveGAMER, an active mapping system that utilizes 3D Gaussian +Splatting (3DGS) to achieve high-quality, real-time scene mapping and +exploration. Unlike traditional NeRF-based methods, which are computationally +demanding and restrict active mapping performance, our approach leverages the +efficient rendering capabilities of 3DGS, allowing effective and efficient +exploration in complex environments. The core of our system is a +rendering-based information gain module that dynamically identifies the most +informative viewpoints for next-best-view planning, enhancing both geometric +and photometric reconstruction accuracy. ActiveGAMER also integrates a +carefully balanced framework, combining coarse-to-fine exploration, +post-refinement, and a global-local keyframe selection strategy to maximize +reconstruction completeness and fidelity. Our system autonomously explores and +reconstructs environments with state-of-the-art geometric and photometric +accuracy and completeness, significantly surpassing existing approaches in both +aspects. Extensive evaluations on benchmark datasets such as Replica and MP3D +highlight ActiveGAMER's effectiveness in active mapping tasks. + +
+
+
+
+
+ + ☆ Toward a Universal Concept of Artificial Personality: Implementing + Robotic Personality in a Kinova Arm + + +
+ The fundamental role of personality in shaping interactions is increasingly +being exploited in robotics. A carefully designed robotic personality has been +shown to improve several key aspects of Human-Robot Interaction (HRI). However, +the fragmentation and rigidity of existing approaches reveal even greater +challenges when applied to non-humanoid robots. On one hand, the state of the +art is very dispersed; on the other hand, Industry 4.0 is moving towards a +future where humans and industrial robots are going to coexist. In this +context, the proper design of a robotic personality can lead to more successful +interactions. This research takes a first step in that direction by integrating +a comprehensive cognitive architecture built upon the definition of robotic +personality - validated on humanoid robots - into a robotic Kinova Jaco2 arm. +The robot personality is defined through the cognitive architecture as a vector +in the three-dimensional space encompassing Conscientiousness, Extroversion, +and Agreeableness, affecting how actions are executed, the action selection +process, and the internal reaction to environmental stimuli. Our main objective +is to determine whether users perceive distinct personalities in the robot, +regardless of its shape, and to understand the role language plays in shaping +these perceptions. To achieve this, we conducted a user study comprising 144 +sessions of a collaborative game between a Kinova Jaco2 arm and participants, +where the robot's behavior was influenced by its assigned personality. +Furthermore, we compared two conditions: in the first, the robot communicated +solely through gestures and action choices, while in the second, it also +utilized verbal interaction. + +
+
+
+
+
+ + ☆ Accelerating Discovery in Natural Science Laboratories with AI and + Robotics: Perspectives and Challenges from the 2024 IEEE ICRA Workshop, + Yokohama, Japan + + +
+ Science laboratory automation enables accelerated discovery in life sciences +and materials. However, it requires interdisciplinary collaboration to address +challenges such as robust and flexible autonomy, reproducibility, throughput, +standardization, the role of human scientists, and ethics. This article +highlights these issues, reflecting perspectives from leading experts in +laboratory automation across different disciplines of the natural sciences. + +
+
+
+
+
+ + ☆ Soft Vision-Based Tactile-Enabled SixthFinger: Advancing Daily Objects + Manipulation for Stroke Survivors + + +
+ The presence of post-stroke grasping deficiencies highlights the critical +need for the development and implementation of advanced compensatory +strategies. This paper introduces a novel system to aid chronic stroke +survivors through the development of a soft, vision-based, tactile-enabled +extra robotic finger. By incorporating vision-based tactile sensing, the system +autonomously adjusts grip force in response to slippage detection. This synergy +not only ensures mechanical stability but also enriches tactile feedback, +mimicking the dynamics of human-object interactions. At the core of our +approach is a transformer-based framework trained on a comprehensive tactile +dataset encompassing objects with a wide range of morphological properties, +including variations in shape, size, weight, texture, and hardness. +Furthermore, we validated the system's robustness in real-world applications, +where it successfully manipulated various everyday objects. The promising +results highlight the potential of this approach to improve the quality of life +for stroke survivors. + +
+
+ comment: Robosoft 2025 conference +
+
+
+
+
+ + ☆ Cost-Effective Robotic Handwriting System with AI Integration + + +
+ This paper introduces a cost-effective robotic handwriting system designed to +replicate human-like handwriting with high precision. Combining a Raspberry Pi +Pico microcontroller, 3D-printed components, and a machine learning-based +handwriting generation model implemented via TensorFlow.js, the system converts +user-supplied text into realistic stroke trajectories. By leveraging +lightweight 3D-printed materials and efficient mechanical designs, the system +achieves a total hardware cost of approximately \$56, significantly +undercutting commercial alternatives. Experimental evaluations demonstrate +handwriting precision within $\pm$0.3 millimeters and a writing speed of +approximately 200 mm/min, positioning the system as a viable solution for +educational, research, and assistive applications. This study seeks to lower +the barriers to personalized handwriting technologies, making them accessible +to a broader audience. + +
+
+ comment: This is an updated version of a paper originally presented at the + 2024 IEEE Long Island Systems, Applications and Technology Conference (LISAT) +
+
+
+
+
+ + ☆ Hierarchical Sampling-based Planner with LTL Constraints and Text + Prompting + + +
+ This project introduces a hierarchical planner integrating Linear Temporal
+Logic (LTL) constraints with natural language prompting for robot motion
+planning. The framework decomposes maps into regions, generates directed
+graphs, and converts them into transition systems for high-level planning. Text
+instructions are translated into LTL formulas and converted to Deterministic
+Finite Automata (DFA) for sequential goal-reaching tasks while adhering to
+safety constraints. High-level plans, derived via Breadth-First Search (BFS),
+guide low-level planners like Rapidly-exploring Random Trees (RRT) and
+Probabilistic Roadmaps (PRM) for obstacle-avoiding navigation that satisfies
+the LTL tasks. The approach demonstrates adaptability to various task
+complexities, though challenges such as graph construction overhead and
+suboptimal path generation remain. Future directions include extending the
+framework to consider terrain conditions and incorporating higher-order
+dynamics.
+
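+ The high-level stage described above amounts to a BFS over the product of the
+ region transition system and the DFA obtained from the LTL formula. The sketch below
+ (Python) assumes the transition system, labeling function, and DFA are already
+ available as callables; their construction and the low-level RRT/PRM execution are
+ omitted, so this is only an illustration of the search step.
+
+from collections import deque
+
+def product_bfs(ts_edges, ts_labels, dfa_next, dfa_accept, ts_init, dfa_init):
+    # ts_edges: region -> iterable of neighbor regions
+    # ts_labels: region -> atomic proposition observed in that region
+    # dfa_next: (dfa_state, proposition) -> next dfa state
+    start = (ts_init, dfa_next(dfa_init, ts_labels(ts_init)))
+    frontier, parent = deque([start]), {start: None}
+    while frontier:
+        node = frontier.popleft()
+        region, q = node
+        if q in dfa_accept:                  # accepting run found
+            plan = []
+            while node is not None:
+                plan.append(node[0])
+                node = parent[node]
+            return plan[::-1]                # region sequence for low-level RRT/PRM
+        for nxt in ts_edges(region):
+            succ = (nxt, dfa_next(q, ts_labels(nxt)))
+            if succ not in parent:
+                parent[succ] = node
+                frontier.append(succ)
+    return None                              # task infeasible under the abstraction
+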
+
+ comment: 8 pages, 17 figures +
+
+
+
+
+ + ☆ Vid2Sim: Realistic and Interactive Simulation from Video for Urban + Navigation + + +
+ Sim-to-real gap has long posed a significant challenge for robot learning in +simulation, preventing the deployment of learned models in the real world. +Previous work has primarily focused on domain randomization and system +identification to mitigate this gap. However, these methods are often limited +by the inherent constraints of the simulation and graphics engines. In this +work, we propose Vid2Sim, a novel framework that effectively bridges the +sim2real gap through a scalable and cost-efficient real2sim pipeline for neural +3D scene reconstruction and simulation. Given a monocular video as input, +Vid2Sim can generate photorealistic and physically interactable 3D simulation +environments to enable the reinforcement learning of visual navigation agents +in complex urban environments. Extensive experiments demonstrate that Vid2Sim +significantly improves the performance of urban navigation in the digital twins +and real world by 31.2% and 68.3% in success rate compared with agents trained +with prior simulation methods. + +
+
+ comment: Project page: https://metadriverse.github.io/vid2sim/ +
+
+
+
+
+ + ☆ Application of Vision-Language Model to Pedestrians Behavior and Scene + Understanding in Autonomous Driving + + +
+ Autonomous driving (AD) has experienced significant improvements in recent +years and achieved promising 3D detection, classification, and localization +results. However, many challenges remain, e.g. semantic understanding of +pedestrians' behaviors, and downstream handling for pedestrian interactions. +Recent studies in applications of Large Language Models (LLM) and +Vision-Language Models (VLM) have achieved promising results in scene +understanding and high-level maneuver planning in diverse traffic scenarios. +However, deploying the billion-parameter LLMs to vehicles requires significant +computation and memory resources. In this paper, we analyzed effective +knowledge distillation of semantic labels to smaller Vision networks, which can +be used for the semantic representation of complex scenes for downstream +decision-making for planning and control. + +
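+ The distillation step discussed above can be illustrated with the standard
+ soft-label objective (Python/PyTorch). The temperature, loss weighting, and exact
+ supervision used in the paper may differ; this is a generic sketch of distilling
+ semantic labels from a large VLM teacher into a smaller vision network.
+
+import torch
+import torch.nn.functional as F
+
+def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
+    # Soften both distributions with temperature T and match them with KL,
+    # while keeping a standard cross-entropy term on the hard labels.
+    soft_teacher = F.softmax(teacher_logits / T, dim=-1)
+    soft_student = F.log_softmax(student_logits / T, dim=-1)
+    kd = F.kl_div(soft_student, soft_teacher, reduction="batchmean") * (T * T)
+    ce = F.cross_entropy(student_logits, labels)
+    return alpha * kd + (1.0 - alpha) * ce
+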
+
+
+
+
+ + ♻ ☆ High-Sensitivity Vision-Based Tactile Sensing Enhanced by + Microstructures and Lightweight CNN + + +
+ Tactile sensing is critical in advanced interactive systems by emulating the
+human sense of touch to detect stimuli. Vision-based tactile sensors (VBTSs)
+are promising for their ability to provide rich information, robustness,
+adaptability, low cost, and multimodal capabilities. However, current
+technologies still have limitations in sensitivity, spatial resolution, and the
+high computational demands of deep learning-based image processing. This paper
+presents a comprehensive approach combining a novel sensor structure with
+micromachined structures and an efficient image processing method, and
+demonstrates that carefully engineered microstructures within the sensor
+hardware can significantly enhance sensitivity while reducing computational
+load. Unlike traditional designs with tracking markers, our sensor incorporates
+an interface surface with micromachined trenches, as an example of
+microstructures, which modulate light transmission and amplify the variation in
+response to applied force. By capturing variations in brightness, wire width,
+and cross pattern locations with a camera, the sensor accurately infers the
+contact location, the magnitude of displacement, and the applied force with a
+lightweight convolutional neural network (CNN). Theoretical and experimental
+results demonstrated that the microstructures significantly enhance sensitivity
+by amplifying the visual effects of shape distortion. The sensor system
+effectively detected forces below 10 mN, and achieved a millimetre-level
+single-point spatial resolution. Using a model with only one convolutional
+layer, a mean absolute error (MAE) below 0.05 mm has been achieved. Its soft
+sensor body ensures compatibility with soft robots and wearable electronics,
+while its immunity to electrical crosstalk and interference guarantees
+reliability in complex human-machine environments.
+
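+ To make the "one convolutional layer" claim concrete, a regressor in that spirit
+ could look like the sketch below (Python/PyTorch). Input resolution, channel counts,
+ pooling, and the output parameterization are assumptions for illustration, not the
+ paper's exact architecture.
+
+import torch
+import torch.nn as nn
+
+class TinyTactileCNN(nn.Module):
+    # Single-convolutional-layer regressor mapping a camera frame of the
+    # microstructured interface to contact location (x, y), displacement, and force.
+    def __init__(self, in_ch=1):
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(in_ch, 16, kernel_size=5, stride=2, padding=2),
+            nn.ReLU(),
+            nn.AdaptiveAvgPool2d((8, 8)),
+        )
+        self.head = nn.Linear(16 * 8 * 8, 4)  # (x, y, displacement, force)
+
+    def forward(self, img):
+        return self.head(self.features(img).flatten(1))
+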
+
+ comment: 27 pages, 13 figures, 2 tables; rearranged figures; corrected typos +
+
+
+
+
+ + ♻ ☆ A Survey on Reinforcement Learning Applications in SLAM + + +
+ The emergence of mobile robotics, particularly in the automotive industry, +introduces a promising era of enriched user experiences and adept handling of +complex navigation challenges. The realization of these advancements +necessitates a focused technological effort and the successful execution of +numerous intricate tasks, particularly in the critical domain of Simultaneous +Localization and Mapping (SLAM). Various artificial intelligence (AI) +methodologies, such as deep learning and reinforcement learning, present viable +solutions to address the challenges in SLAM. This study specifically explores +the application of reinforcement learning in the context of SLAM. By enabling +the agent (the robot) to iteratively interact with and receive feedback from +its environment, reinforcement learning facilitates the acquisition of +navigation and mapping skills, thereby enhancing the robot's decision-making +capabilities. This approach offers several advantages, including improved +navigation proficiency, increased resilience, reduced dependence on sensor +precision, and refinement of the decision-making process. The findings of this +study, which provide an overview of reinforcement learning's utilization in +SLAM, reveal significant advancements in the field. The investigation also +highlights the evolution and innovative integration of these techniques. + +
+
+
+
+
+ + ♻ ☆ Beyond Sight: Finetuning Generalist Robot Policies with Heterogeneous + Sensors via Language Grounding + + +
+ Interacting with the world is a multi-sensory experience: achieving effective
+general-purpose interaction requires making use of all available modalities --
+including vision, touch, and audio -- to fill in gaps from partial observation.
+For example, when vision is occluded while reaching into a bag, a robot should
+rely on its senses of touch and sound. However, state-of-the-art generalist robot
+policies are typically trained on large datasets to predict robot actions
+solely from visual and proprioceptive observations. In this work, we propose
+FuSe, a novel approach that enables finetuning visuomotor generalist policies
+on heterogeneous sensor modalities for which large datasets are not readily
+available by leveraging natural language as a common cross-modal grounding. We
+combine a multimodal contrastive loss with a sensory-grounded language
+generation loss to encode high-level semantics. In the context of robot
+manipulation, we show that FuSe enables performing challenging tasks that
+require reasoning jointly over modalities such as vision, touch, and sound in a
+zero-shot setting, such as multimodal prompting, compositional cross-modal
+prompting, and descriptions of objects it interacts with. We show that the same
+recipe is applicable to widely different generalist policies, including both
+diffusion-based generalist policies and large vision-language-action (VLA)
+models. Extensive experiments in the real world show that FuSe is able to
+increase success rates by over 20% compared to all considered baselines.
+
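+ The cross-modal grounding term can be sketched as a symmetric InfoNCE loss between
+ sensor embeddings and language embeddings of the matching instruction (Python/PyTorch).
+ This is a generic sketch: FuSe additionally uses a sensory-grounded language-generation
+ loss, which is not shown, and the encoders producing the embeddings are assumed.
+
+import torch
+import torch.nn.functional as F
+
+def multimodal_contrastive_loss(sensor_emb, text_emb, temperature=0.07):
+    # sensor_emb, text_emb: (B, D) embeddings of matched sensor/language pairs.
+    s = F.normalize(sensor_emb, dim=-1)
+    t = F.normalize(text_emb, dim=-1)
+    logits = s @ t.T / temperature                   # (B, B) similarity matrix
+    targets = torch.arange(len(s), device=s.device)  # matched pairs on the diagonal
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.T, targets))
+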
+
+
+
+
+ + ♻ ☆ An Accurate and Real-time Relative Pose Estimation from Triple + Point-line Images by Decoupling Rotation and Translation + + +
+ Line features are valid complements for point features in man-made +environments. 3D-2D constraints provided by line features have been widely used +in Visual Odometry (VO) and Structure-from-Motion (SfM) systems. However, how +to accurately solve three-view relative motion only with 2D observations of +points and lines in real time has not been fully explored. In this paper, we +propose a novel three-view pose solver based on rotation-translation decoupled +estimation. First, a high-precision rotation estimation method based on normal +vector coplanarity constraints that consider the uncertainty of observations is +proposed, which can be solved by Levenberg-Marquardt (LM) algorithm +efficiently. Second, a robust linear translation constraint that minimizes the +degree of the rotation components and feature observation components in +equations is elaborately designed for estimating translations accurately. +Experiments on synthetic data and real-world data show that the proposed +approach improves both rotation and translation accuracy compared to the +classical trifocal-tensor-based method and the state-of-the-art two-view +algorithm in outdoor and indoor environments. + +
+
+
+
+
+ + ♻ ☆ USV-AUV Collaboration Framework for Underwater Tasks under Extreme Sea + Conditions + + +
+ Autonomous underwater vehicles (AUVs) are valuable for ocean exploration due
+to their flexibility and ability to carry communication and detection units.
+Nevertheless, AUVs alone often face challenges in harsh and extreme sea
+conditions. This study introduces an unmanned surface vehicle (USV)-AUV
+collaboration framework, which includes high-precision multi-AUV positioning
+using USV path planning via Fisher information matrix optimization and
+reinforcement learning for multi-AUV cooperative tasks. Applied to a multi-AUV
+underwater data collection task scenario, extensive simulations validate the
+framework's feasibility and superior performance, highlighting exceptional
+coordination and robustness under extreme sea conditions. To accelerate
+relevant research in this field, we have made the simulation code (demo
+version) available as open-source.
+
+
+
+
+
+ + ♻ ☆ Speedup Techniques for Switchable Temporal Plan Graph Optimization + + +
+ Multi-Agent Path Finding (MAPF) focuses on planning collision-free paths for +multiple agents. However, during the execution of a MAPF plan, agents may +encounter unexpected delays, which can lead to inefficiencies, deadlocks, or +even collisions. To address these issues, the Switchable Temporal Plan Graph +provides a framework for finding an acyclic Temporal Plan Graph with the +minimum execution cost under delays, ensuring deadlock- and collision-free +execution. Unfortunately, existing optimal algorithms, such as Mixed Integer +Linear Programming and Graph-Based Switchable Edge Search (GSES), are often too +slow for practical use. This paper introduces Improved GSES, which +significantly accelerates GSES through four speedup techniques: stronger +admissible heuristics, edge grouping, prioritized branching, and incremental +implementation. Experiments conducted on four different map types with varying +numbers of agents demonstrate that Improved GSES consistently achieves over +twice the success rate of GSES and delivers up to a 30-fold speedup on +instances where both methods successfully find solutions. + +
+
+ comment: Accepted by AAAI 2025. This version contains the appendix +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 24 + +
+
+
+ + ☆ Comparison of Autoencoders for tokenization of ASL datasets + + +
+ Generative AI, powered by large language models (LLMs), has revolutionized +applications across text, audio, images, and video. This study focuses on +developing and evaluating encoder-decoder architectures for the American Sign +Language (ASL) image dataset, consisting of 87,000 images across 29 hand sign +classes. Three approaches were compared: Feedforward Autoencoders, +Convolutional Autoencoders, and Diffusion Autoencoders. The Diffusion +Autoencoder outperformed the others, achieving the lowest mean squared error +(MSE) and highest Mean Opinion Score (MOS) due to its probabilistic noise +modeling and iterative denoising capabilities. The Convolutional Autoencoder +demonstrated effective spatial feature extraction but lacked the robustness of +the diffusion process, while the Feedforward Autoencoder served as a baseline +with limitations in handling complex image data. Objective and subjective +evaluations confirmed the superiority of the Diffusion Autoencoder for +high-fidelity image reconstruction, emphasizing its potential in multimodal AI +applications such as sign language recognition and generation. This work +provides critical insights into designing robust encoder-decoder systems to +advance multimodal AI capabilities. + +
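For readers who want a concrete starting point, a minimal convolutional autoencoder with the MSE reconstruction objective looks roughly as follows. The architecture, input resolution (64x64 RGB), and hyperparameters are placeholders, not the configurations compared in the study.

```python
# Generic convolutional autoencoder sketch; not the architectures evaluated above.
import torch
import torch.nn as nn

class ConvAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 32, 4, stride=2, padding=1), nn.ReLU(),    # 64 -> 32
            nn.Conv2d(32, 64, 4, stride=2, padding=1), nn.ReLU(),   # 32 -> 16
            nn.Conv2d(64, 128, 4, stride=2, padding=1), nn.ReLU(),  # 16 -> 8
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU(),   # 8 -> 16
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1), nn.ReLU(),    # 16 -> 32
            nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1), nn.Sigmoid(),  # 32 -> 64
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

model = ConvAutoencoder()
x = torch.rand(8, 3, 64, 64)                 # batch of toy hand-sign images
recon = model(x)
mse = nn.functional.mse_loss(recon, x)       # the MSE metric reported above
print(recon.shape, float(mse))
```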
+
+ comment: 9 pages, 2 tables, 4 figures +
+
+
+
+
+ + ☆ Super-Resolution of 3D Micro-CT Images Using Generative Adversarial + Networks: Enhancing Resolution and Segmentation Accuracy + + +
+ We develop a procedure for substantially improving the quality of segmented +3D micro-Computed Tomography (micro-CT) images of rocks with a Machine Learning +(ML) Generative Model. The proposed model enhances the resolution eightfold +(8x) and addresses segmentation inaccuracies due to the overlapping X-ray +attenuation in micro-CT measurement for different rock minerals and phases. The +proposed generative model is a 3D Deep Convolutional Wasserstein Generative +Adversarial Network with Gradient Penalty (3D DC WGAN-GP). The algorithm is +trained on segmented 3D low-resolution micro-CT images and segmented unpaired +complementary 2D high-resolution Laser Scanning Microscope (LSM) images. The +algorithm was demonstrated on multiple samples of Berea sandstones. We achieved +high-quality super-resolved 3D images with a resolution of 0.4375 micro-m/voxel +and accurate segmentation for constituting minerals and pore space. The +described procedure can significantly expand the modern capabilities of digital +rock physics. + +
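The WGAN-GP objective named above adds a gradient penalty to the critic loss. A compact sketch of that standard penalty term (with a toy 3D critic, not the paper's network) is:

```python
# Standard WGAN-GP gradient penalty on interpolated samples; the tiny 3D critic
# and 16^3 patches are placeholders for illustration.
import torch
import torch.nn as nn

critic = nn.Sequential(nn.Conv3d(1, 8, 3, padding=1), nn.LeakyReLU(0.2),
                       nn.AdaptiveAvgPool3d(1), nn.Flatten(), nn.Linear(8, 1))

def gradient_penalty(critic, real, fake, lambda_gp=10.0):
    eps = torch.rand(real.size(0), 1, 1, 1, 1, device=real.device)
    x_hat = (eps * real + (1 - eps) * fake).requires_grad_(True)
    scores = critic(x_hat)
    grads = torch.autograd.grad(outputs=scores, inputs=x_hat,
                                grad_outputs=torch.ones_like(scores),
                                create_graph=True)[0]
    grad_norm = grads.reshape(grads.size(0), -1).norm(2, dim=1)
    return lambda_gp * ((grad_norm - 1.0) ** 2).mean()

real = torch.rand(2, 1, 16, 16, 16)   # toy stand-ins for 3D micro-CT patches
fake = torch.rand(2, 1, 16, 16, 16)
print(float(gradient_penalty(critic, real, fake)))
```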
+
+ comment: 24 pages, 9 figures +
+
+
+
+
+ + ☆ Evaluating unsupervised contrastive learning framework for MRI sequences + classification + + +
+ The automatic identification of Magnetic Resonance Imaging (MRI) sequences +can streamline clinical workflows by reducing the time radiologists spend +manually sorting and identifying sequences, thereby enabling faster diagnosis +and treatment planning for patients. However, the lack of standardization in +the parameters of MRI scans poses challenges for automated systems and +complicates the generation and utilization of datasets for machine learning +research. To address this issue, we propose a system for MRI sequence +identification using an unsupervised contrastive deep learning framework. By +training a convolutional neural network based on the ResNet-18 architecture, +our system classifies nine common MRI sequence types as a 9-class +classification problem. The network was trained using an in-house internal +dataset and validated on several public datasets, including BraTS, ADNI, Fused +Radiology-Pathology Prostate Dataset, the Breast Cancer Dataset (ACRIN), among +others, encompassing diverse acquisition protocols and requiring only 2D slices +for training. Our system achieves a classification accuracy of over 0.95 across +the nine most common MRI sequence types. + +
+
+
+
+
+ + ☆ CULTURE3D: Cultural Landmarks and Terrain Dataset for 3D Applications + + +
+ In this paper, we present a large-scale fine-grained dataset using +high-resolution images captured from locations worldwide. Compared to existing +datasets, our dataset offers a significantly larger size and includes a higher +level of detail, making it uniquely suited for fine-grained 3D applications. +Notably, our dataset is built using drone-captured aerial imagery, which +provides a more accurate perspective for capturing real-world site layouts and +architectural structures. By reconstructing environments with these detailed +images, our dataset supports applications such as the COLMAP format for +Gaussian Splatting and the Structure-from-Motion (SfM) method. It is compatible +with widely-used techniques including SLAM, Multi-View Stereo, and Neural +Radiance Fields (NeRF), enabling accurate 3D reconstructions and point clouds. +This makes it a benchmark for reconstruction and segmentation tasks. The +dataset enables seamless integration with multi-modal data, supporting a range +of 3D applications, from architectural reconstruction to virtual tourism. Its +flexibility promotes innovation, facilitating breakthroughs in 3D modeling and +analysis. + +
+
+
+
+
+ + ☆ Benchmarking YOLOv8 for Optimal Crack Detection in Civil Infrastructure + + +
+ Ensuring the structural integrity and safety of bridges is crucial for the reliability of transportation networks and public safety. Traditional crack detection methods are increasingly being supplemented or replaced by advanced artificial intelligence (AI) techniques. However, most of these models rely on two-stage target detection algorithms, which raise concerns for real-time applications due to their lower speed. Models such as YOLO (You Only Look Once) have emerged as transformative tools thanks to their remarkable speed and accuracy; however, the potential of the latest YOLOv8 framework in this domain remains underexplored. This study bridges that gap by rigorously evaluating YOLOv8's performance across five model scales (nano, small, medium, large, and extra-large) using a high-quality Roboflow dataset. A comprehensive hyperparameter optimization was performed, testing six state-of-the-art optimizers: Stochastic Gradient Descent, Adaptive Moment Estimation, Adam with Decoupled Weight Decay, Root Mean Square Propagation, Rectified Adam, and Nesterov-accelerated Adam. Results revealed that YOLOv8, optimized with Stochastic Gradient Descent, delivered exceptional accuracy and speed, setting a new benchmark for real-time crack detection. Beyond its immediate application, this research positions YOLOv8 as a foundational approach for integrating advanced computer vision techniques into infrastructure monitoring. By enabling more reliable and proactive maintenance of aging bridge networks, this work paves the way for safer, more efficient transportation systems worldwide.
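A benchmarking loop of the kind described could be scripted with the ultralytics package roughly as below. Treat this as a sketch: the dataset YAML path, epoch budget, and the exact optimizer names accepted by a particular ultralytics release are assumptions to verify locally.

```python
# Rough sweep over YOLOv8 scales and optimizers; "crack_dataset.yaml" is a
# hypothetical dataset file, and epochs/imgsz are illustrative settings only.
from ultralytics import YOLO

scales = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]
optimizers = ["SGD", "Adam", "AdamW", "RMSProp", "RAdam", "NAdam"]

results = {}
for weights in scales:
    for opt in optimizers:
        model = YOLO(weights)
        model.train(data="crack_dataset.yaml", epochs=50, imgsz=640, optimizer=opt)
        metrics = model.val()
        results[(weights, opt)] = metrics.box.map50   # mAP@0.5 on the val split

best = max(results, key=results.get)
print("Best scale/optimizer combination:", best, results[best])
```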
+
+ comment: Accepted at 104th TRB Annual Meeting 2025 +
+
+
+
+
+ + ☆ Driver Age and Its Effect on Key Driving Metrics: Insights from Dynamic + Vehicle Data + + +
+ By 2030, the senior population aged 65 and older is expected to increase by +over 50%, significantly raising the number of older drivers on the road. +Drivers over 70 face higher crash death rates compared to those in their +forties and fifties, underscoring the importance of developing more effective +safety interventions for this demographic. Although the impact of aging on +driving behavior has been studied, there is limited research on how these +behaviors translate into real-world driving scenarios. This study addresses +this need by leveraging Naturalistic Driving Data (NDD) to analyze driving +performance measures - specifically, speed limit adherence on interstates and +deceleration at stop intersections, both of which may be influenced by +age-related declines. Using NDD, we developed Cumulative Distribution Functions +(CDFs) to establish benchmarks for key driving behaviors among senior and young +drivers. Our analysis, which included anomaly detection, benchmark comparisons, +and accuracy evaluations, revealed significant differences in driving patterns +primarily related to speed limit adherence at 75mph. While our approach shows +promising potential for enhancing Advanced Driver Assistance Systems (ADAS) by +providing tailored interventions based on age-specific adherence to speed limit +driving patterns, we recognize the need for additional data to refine and +validate metrics for other driving behaviors. By establishing precise +benchmarks for various driving performance metrics, ADAS can effectively +identify anomalies, such as abrupt deceleration, which may indicate impaired +driving or other safety concerns. This study lays a strong foundation for +future research aimed at improving safety interventions through detailed +driving behavior analysis. + +
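The CDF-based benchmarking idea is easy to prototype. The snippet below builds empirical CDFs from synthetic speed samples (the study's naturalistic driving data is not reproduced here) and compares the fraction of driving above a 75 mph limit for two groups.

```python
# Toy illustration of benchmarking a driving metric with empirical CDFs;
# the speed distributions are synthetic stand-ins.
import numpy as np

def ecdf(samples):
    x = np.sort(np.asarray(samples, dtype=float))
    y = np.arange(1, len(x) + 1) / len(x)
    return x, y

rng = np.random.default_rng(0)
young_speeds = rng.normal(76.0, 3.0, 5000)     # speeds on a 75 mph interstate
senior_speeds = rng.normal(72.5, 2.5, 5000)

for label, speeds in [("young", young_speeds), ("senior", senior_speeds)]:
    x, y = ecdf(speeds)
    frac_over_limit = 1.0 - np.interp(75.0, x, y)   # P(speed > 75 mph)
    print(f"{label}: fraction exceeding the 75 mph limit = {frac_over_limit:.2f}")
```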
+
+ comment: 21 pages, 9 figures, 4 Tables, 104th TRB Annual Meeting 2025, + Washington DC +
+
+
+
+
+ + ☆ Local Foreground Selection aware Attentive Feature Reconstruction for + few-shot fine-grained plant species classification + + +
+ Plant species exhibit significant intra-class variation and minimal inter-class variation. To enhance classification accuracy, it is essential to reduce intra-class variation while maximizing inter-class variation. This paper addresses plant species classification using a limited number of labelled samples and introduces a novel Local Foreground Selection (LFS) attention mechanism. LFS is a straightforward module designed to generate discriminative support and query feature maps. It operates by integrating two types of attention: local attention, which captures local spatial details to enhance feature discrimination and increase inter-class differentiation, and foreground selection attention, which emphasizes the foreground plant object while mitigating background interference. By focusing on the foreground, the query and support features selectively highlight relevant feature sequences and disregard less significant background sequences, thereby reducing intra-class differences. Experimental results from three plant species datasets demonstrate the effectiveness of the proposed LFS attention mechanism and its complementary advantages over previous feature reconstruction methods.
+
+
+
+
+ + ☆ Synthetic Prior for Few-Shot Drivable Head Avatar Inversion + + +
+ We present SynShot, a novel method for the few-shot inversion of a drivable +head avatar based on a synthetic prior. We tackle two major challenges. First, +training a controllable 3D generative network requires a large number of +diverse sequences, for which pairs of images and high-quality tracked meshes +are not always available. Second, state-of-the-art monocular avatar models +struggle to generalize to new views and expressions, lacking a strong prior and +often overfitting to a specific viewpoint distribution. Inspired by machine +learning models trained solely on synthetic data, we propose a method that +learns a prior model from a large dataset of synthetic heads with diverse +identities, expressions, and viewpoints. With few input images, SynShot +fine-tunes the pretrained synthetic prior to bridge the domain gap, modeling a +photorealistic head avatar that generalizes to novel expressions and +viewpoints. We model the head avatar using 3D Gaussian splatting and a +convolutional encoder-decoder that outputs Gaussian parameters in UV texture +space. To account for the different modeling complexities over parts of the +head (e.g., skin vs hair), we embed the prior with explicit control for +upsampling the number of per-part primitives. Compared to state-of-the-art +monocular methods that require thousands of real training images, SynShot +significantly improves novel view and expression synthesis. + +
+
+ comment: Website https://zielon.github.io/synshot/ +
+
+
+
+
+ + ☆ ActiveGAMER: Active GAussian Mapping through Efficient Rendering + + +
+ We introduce ActiveGAMER, an active mapping system that utilizes 3D Gaussian +Splatting (3DGS) to achieve high-quality, real-time scene mapping and +exploration. Unlike traditional NeRF-based methods, which are computationally +demanding and restrict active mapping performance, our approach leverages the +efficient rendering capabilities of 3DGS, allowing effective and efficient +exploration in complex environments. The core of our system is a +rendering-based information gain module that dynamically identifies the most +informative viewpoints for next-best-view planning, enhancing both geometric +and photometric reconstruction accuracy. ActiveGAMER also integrates a +carefully balanced framework, combining coarse-to-fine exploration, +post-refinement, and a global-local keyframe selection strategy to maximize +reconstruction completeness and fidelity. Our system autonomously explores and +reconstructs environments with state-of-the-art geometric and photometric +accuracy and completeness, significantly surpassing existing approaches in both +aspects. Extensive evaluations on benchmark datasets such as Replica and MP3D +highlight ActiveGAMER's effectiveness in active mapping tasks. + +
+
+
+
+
+ + ☆ MedGrad E-CLIP: Enhancing Trust and Transparency in AI-Driven Skin + Lesion Diagnosis + + +
+ As deep learning models gain traction in medical data, ensuring transparent and trustworthy decision-making is essential. In skin cancer diagnosis, while advancements in lesion detection and classification have improved accuracy, the black-box nature of these methods poses challenges in understanding their decision processes, leading to trust issues among physicians. This study leverages the CLIP (Contrastive Language-Image Pretraining) model, trained on different skin lesion datasets, to capture meaningful relationships between visual features and diagnostic criteria terms. To further enhance transparency, we propose a method called MedGrad E-CLIP, which builds on gradient-based E-CLIP by incorporating a weighted entropy mechanism designed for complex medical imaging like skin lesions. This approach highlights critical image regions linked to specific diagnostic descriptions. The developed integrated pipeline not only classifies skin lesions by matching corresponding descriptions but also adds an essential layer of explainability developed especially for medical data. By visually explaining how different features in an image relate to diagnostic criteria, this approach demonstrates the potential of advanced vision-language models in medical image analysis, ultimately improving transparency, robustness, and trust in AI-driven diagnostic systems.
+
+ comment: Accepted to 2025 IEEE/CVF Winter Conference on Applications of + Computer Vision Workshops (WACVW) +
+
+
+
+
+ + ☆ Transforming Vision Transformer: Towards Efficient Multi-Task + Asynchronous Learning NeurIPS 2024 + + +
+ Multi-Task Learning (MTL) for Vision Transformer aims at enhancing the model capability by tackling multiple tasks simultaneously. Most recent works have predominantly focused on designing Mixture-of-Experts (MoE) structures and integrating Low-Rank Adaptation (LoRA) to efficiently perform multi-task learning. However, their rigid combination hampers both the optimization of MoE and the effectiveness of reparameterization of LoRA, leading to sub-optimal performance and low inference speed. In this work, we propose a novel approach dubbed Efficient Multi-Task Learning (EMTAL) by transforming a pre-trained Vision Transformer into an efficient multi-task learner during training, and reparameterizing the learned structure for efficient inference. Specifically, we first develop the MoEfied LoRA structure, which decomposes the pre-trained Transformer into a low-rank MoE structure and employs LoRA to fine-tune the parameters. Subsequently, we take into account the intrinsic asynchronous nature of multi-task learning and devise a learning Quality Retaining (QR) optimization mechanism, leveraging the historical high-quality class logits to prevent a well-trained task from performance degradation. Finally, we design a router fading strategy to integrate the learned parameters into the original Transformer, achieving efficient inference. Extensive experiments on public benchmarks demonstrate the superiority of our method compared to the state-of-the-art multi-task learning approaches.
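As background for the LoRA reparameterization the abstract relies on, a minimal low-rank adapter around a frozen linear layer, with a merge step for inference, can be sketched as follows. This is a generic sketch, not EMTAL's MoEfied structure.

```python
# Generic LoRA-style adapter: train a low-rank update, then fold it back into
# the frozen base weight for inference. Rank/alpha values are placeholders.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank=8, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                      # freeze pretrained weights
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

    def merge(self):
        """Absorb the low-rank update into the base weight (no extra cost at inference)."""
        with torch.no_grad():
            self.base.weight += self.scale * (self.B @ self.A)
        return self.base

layer = LoRALinear(nn.Linear(768, 768))
y = layer(torch.randn(2, 768))
merged = layer.merge()          # a plain nn.Linear with the update folded in
print(y.shape, merged)
```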
+
+ comment: Accepted by the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ Real-Time Neural-Enhancement for Online Cloud Gaming + + +
+ Online cloud gaming demands real-time, high-quality video transmission across variable wide-area networks (WANs). Neural-enhanced video transmission algorithms that employ super-resolution (SR) for video quality enhancement have proven effective in challenging WAN environments. However, these SR-based methods require intensive fine-tuning for the whole video, making them infeasible for diverse online cloud gaming. To address this, we introduce River, a cloud gaming delivery framework designed around the observation that video segment features in cloud gaming are typically repetitive and redundant. This creates a significant opportunity to reuse fine-tuned SR models, reducing fine-tuning latency from minutes to a query latency of milliseconds. To enable this idea, we design a practical system that addresses several challenges, such as model organization, online model scheduling, and transfer strategy. River first builds a content-aware encoder that fine-tunes SR models for diverse video segments and stores them in a lookup table. When delivering cloud gaming video streams online, River checks the video features and retrieves the most relevant SR models to enhance the frame quality. Meanwhile, if no existing SR model performs well enough for some video segments, River further fine-tunes new models and updates the lookup table. Finally, to avoid the overhead of streaming model weights to clients, River uses a prefetching strategy that predicts the models most likely to be retrieved. Our evaluation based on real video game streaming demonstrates that River can reduce redundant training overhead by 44% and improve the Peak Signal-to-Noise Ratio by 1.81 dB compared to the SOTA solutions. Practical deployment shows River meets real-time requirements, achieving approximately 720p 20fps on mobile devices.
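The reuse-or-fine-tune decision at the heart of the lookup-table design can be mocked up in a few lines. The similarity threshold, feature extractor, and model placeholder below are assumptions for illustration, not River's actual components.

```python
# Sketch of a content-feature lookup: retrieve the closest cached SR model by
# cosine similarity, or fine-tune and cache a new one on a miss.
import numpy as np

class SRModelLookup:
    def __init__(self, threshold=0.9):
        self.features, self.models, self.threshold = [], [], threshold

    def retrieve(self, feat):
        if not self.features:
            return None
        sims = [float(feat @ f / (np.linalg.norm(feat) * np.linalg.norm(f)))
                for f in self.features]
        best = int(np.argmax(sims))
        return self.models[best] if sims[best] >= self.threshold else None

    def add(self, feat, model):
        self.features.append(feat)
        self.models.append(model)

lookup = SRModelLookup()
segment_feat = np.random.rand(128)          # placeholder video-segment feature
model = lookup.retrieve(segment_feat)
if model is None:                           # cache miss: fine-tune and store
    model = "finetuned-sr-model"            # stand-in for an actual SR network
    lookup.add(segment_feat, model)
print(model)
```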
+
+
+
+
+ + ☆ Defect Detection Network In PCB Circuit Devices Based on GAN Enhanced + YOLOv11 + + +
+ This study proposes an advanced method for surface defect detection in +printed circuit boards (PCBs) using an improved YOLOv11 model enhanced with a +generative adversarial network (GAN). The approach focuses on identifying six +common defect types: missing hole, rat bite, open circuit, short circuit, burr, +and virtual welding. By employing GAN to generate synthetic defect images, the +dataset is augmented with diverse and realistic patterns, improving the model's +ability to generalize, particularly for complex and infrequent defects like +burrs. The enhanced YOLOv11 model is evaluated on a PCB defect dataset, +demonstrating significant improvements in accuracy, recall, and robustness, +especially when dealing with defects in complex environments or small targets. +This research contributes to the broader field of electronic design automation +(EDA), where efficient defect detection is a crucial step in ensuring +high-quality PCB manufacturing. By integrating advanced deep learning +techniques, this approach enhances the automation and precision of defect +detection, reducing reliance on manual inspection and accelerating +design-to-production workflows. The findings underscore the importance of +incorporating GAN-based data augmentation and optimized detection architectures +in EDA processes, providing valuable insights for improving reliability and +efficiency in PCB defect detection within industrial applications. + +
+
+
+
+
+ + ☆ Uncertainty-Aware Online Extrinsic Calibration: A Conformal Prediction + Approach + + +
+ Accurate sensor calibration is crucial for autonomous systems, yet its +uncertainty quantification remains underexplored. We present the first approach +to integrate uncertainty awareness into online extrinsic calibration, combining +Monte Carlo Dropout with Conformal Prediction to generate prediction intervals +with a guaranteed level of coverage. Our method proposes a framework to enhance +existing calibration models with uncertainty quantification, compatible with +various network architectures. Validated on KITTI (RGB Camera-LiDAR) and DSEC +(Event Camera-LiDAR) datasets, we demonstrate effectiveness across different +visual sensor types, measuring performance with adapted metrics to evaluate the +efficiency and reliability of the intervals. By providing calibration +parameters with quantifiable confidence measures, we offer insights into the +reliability of calibration estimates, which can greatly improve the robustness +of sensor fusion in dynamic environments and usefully serve the Computer Vision +community. + +
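A split-conformal construction of prediction intervals, the general technique the abstract combines with Monte Carlo Dropout, can be written compactly as below; the calibration data are synthetic and the absolute-residual score is the simplest possible choice, not necessarily the paper's.

```python
# Generic split-conformal intervals from held-out calibration residuals.
import numpy as np

def conformal_interval(cal_pred, cal_true, test_pred, alpha=0.1):
    """Return (lower, upper) arrays with roughly (1 - alpha) marginal coverage."""
    residuals = np.abs(cal_true - cal_pred)
    n = len(residuals)
    # Finite-sample-corrected empirical quantile of the residual scores.
    q = np.quantile(residuals, min(1.0, np.ceil((n + 1) * (1 - alpha)) / n))
    return test_pred - q, test_pred + q

rng = np.random.default_rng(1)
cal_true = rng.normal(0.0, 1.0, 500)              # e.g. a calibration-parameter offset
cal_pred = cal_true + rng.normal(0.0, 0.2, 500)   # noisy point predictions
test_pred = np.array([0.1, -0.3, 0.7])
lo, hi = conformal_interval(cal_pred, cal_true, test_pred)
print(np.stack([lo, hi], axis=1))
```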
+
+ comment: Accepted for publication at WACV 2025 +
+
+
+
+
+ + ☆ A Foundational Generative Model for Breast Ultrasound Image Analysis + + +
+ Foundational models have emerged as powerful tools for addressing various tasks in clinical settings. However, their potential for breast ultrasound analysis remains untapped. In this paper, we present BUSGen, the first foundational generative model specifically designed for breast ultrasound image analysis. Pretrained on over 3.5 million breast ultrasound images, BUSGen has acquired extensive knowledge of breast structures, pathological features, and clinical variations. With few-shot adaptation, BUSGen can generate repositories of realistic and informative task-specific data, facilitating the development of models for a wide range of downstream tasks. Extensive experiments highlight BUSGen's exceptional adaptability, significantly exceeding real-data-trained foundational models in breast cancer screening, diagnosis, and prognosis. In breast cancer early diagnosis, our approach outperformed all board-certified radiologists (n=9), achieving an average sensitivity improvement of 16.5% (P-value<0.0001). Additionally, we characterized the scaling effect of using generated data, which was as effective as the collected real-world data for training diagnostic models. Moreover, extensive experiments demonstrated that our approach improved the generalization ability of downstream models. Importantly, BUSGen protects patient privacy by enabling fully de-identified data sharing, making progress toward secure medical data utilization. An online demo of BUSGen is available at https://aibus.bio.
+
+ comment: Peking University; Stanford University; Peking University Cancer + Hospital & Institute; Peking Union Medical College Hospital; Cancer Hospital, + Chinese Academy of Medical Sciences +
+
+
+
+
+ + ☆ LarvSeg: Exploring Image Classification Data For Large Vocabulary + Semantic Segmentation via Category-wise Attentive Classifier + + +
+ Scaling up the vocabulary of semantic segmentation models is extremely +challenging because annotating large-scale mask labels is labour-intensive and +time-consuming. Recently, language-guided segmentation models have been +proposed to address this challenge. However, their performance drops +significantly when applied to out-of-distribution categories. In this paper, we +propose a new large vocabulary semantic segmentation framework, called LarvSeg. +Different from previous works, LarvSeg leverages image classification data to +scale the vocabulary of semantic segmentation models as large-vocabulary +classification datasets usually contain balanced categories and are much easier +to obtain. However, for classification tasks, the category is image-level, +while for segmentation we need to predict the label at pixel level. To address +this issue, we first propose a general baseline framework to incorporate +image-level supervision into the training process of a pixel-level segmentation +model, making the trained network perform semantic segmentation on newly +introduced categories in the classification data. We then observe that a model +trained on segmentation data can group pixel features of categories beyond the +training vocabulary. Inspired by this finding, we design a category-wise +attentive classifier to apply supervision to the precise regions of +corresponding categories to improve the model performance. Extensive +experiments demonstrate that LarvSeg significantly improves the large +vocabulary semantic segmentation performance, especially in the categories +without mask labels. For the first time, we provide a 21K-category semantic +segmentation model with the help of ImageNet21K. The code is available at +https://github.com/HaojunYu1998/large_voc_seg. + +
+
+ comment: PRCV 2024 +
+
+
+
+
+ + ☆ A General Framework for Inference-time Scaling and Steering of Diffusion + Models + + +
+ Diffusion models produce impressive results in modalities ranging from images +and video to protein design and text. However, generating samples with +user-specified properties remains a challenge. Recent research proposes +fine-tuning models to maximize rewards that capture desired properties, but +these methods require expensive training and are prone to mode collapse. In +this work, we propose Feynman Kac (FK) steering, an inference-time framework +for steering diffusion models with reward functions. FK steering works by +sampling a system of multiple interacting diffusion processes, called +particles, and resampling particles at intermediate steps based on scores +computed using functions called potentials. Potentials are defined using +rewards for intermediate states and are selected such that a high value +indicates that the particle will yield a high-reward sample. We explore various +choices of potentials, intermediate rewards, and samplers. We evaluate FK +steering on text-to-image and text diffusion models. For steering text-to-image +models with a human preference reward, we find that FK steering a 0.8B +parameter model outperforms a 2.6B parameter fine-tuned model on prompt +fidelity, with faster sampling and no training. For steering text diffusion +models with rewards for text quality and specific text attributes, we find that +FK steering generates lower perplexity, more linguistically acceptable outputs +and enables gradient-free control of attributes like toxicity. Our results +demonstrate that inference-time scaling and steering of diffusion models, even +with off-the-shelf rewards, can provide significant sample quality gains and +controllability benefits. Code is available at +https://github.com/zacharyhorvitz/Fk-Diffusion-Steering . + +
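The particle/potential mechanism can be seen in miniature with a one-dimensional toy process: propagate particles, weight them by exponentiated intermediate rewards, and resample. The dynamics and reward below are illustrative stand-ins, not the paper's samplers or potentials.

```python
# Toy reward-guided resampling in the spirit of FK steering (1-D, synthetic).
import numpy as np

rng = np.random.default_rng(0)
num_particles, num_steps, target = 256, 50, 3.0

def reward(x):
    return -(x - target) ** 2                 # intermediate reward on states

particles = rng.normal(0.0, 1.0, num_particles)
for t in range(num_steps):
    # Prior "denoising" dynamics per particle (placeholder for a real sampler).
    particles = 0.95 * particles + 0.3 * rng.normal(0.0, 1.0, num_particles)
    # Potentials: exponentiated intermediate rewards become resampling weights.
    log_w = reward(particles)
    weights = np.exp(log_w - log_w.max())
    weights /= weights.sum()
    idx = rng.choice(num_particles, size=num_particles, p=weights)
    particles = particles[idx]

print("mean of steered samples:", particles.mean())   # pulled toward the target
```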
+
+
+
+
+ + ☆ Faithful Counterfactual Visual Explanations (FCVE) + + +
+ Deep learning models in computer vision have made remarkable progress, but +their lack of transparency and interpretability remains a challenge. The +development of explainable AI can enhance the understanding and performance of +these models. However, existing techniques often struggle to provide convincing +explanations that non-experts easily understand, and they cannot accurately +identify models' intrinsic decision-making processes. To address these +challenges, we propose to develop a counterfactual explanation (CE) model that +balances plausibility and faithfulness. This model generates easy-to-understand +visual explanations by making minimum changes necessary in images without +altering the pixel data. Instead, the proposed method identifies internal +concepts and filters learned by models and leverages them to produce plausible +counterfactual explanations. The provided explanations reflect the internal +decision-making process of the model, thus ensuring faithfulness to the model. + +
+
+
+
+
+ + ♻ ☆ DoubleDiffusion: Combining Heat Diffusion with Denoising Diffusion for + Generative Learning on 3D Meshes + + +
+ This paper proposes DoubleDiffusion, a novel framework that combines heat dissipation diffusion and denoising diffusion for direct generative learning on 3D mesh surfaces. Our approach addresses the challenges of generating continuous signal distributions residing on a curved manifold surface. Unlike previous methods that rely on unrolling 3D meshes into 2D or adopting field representations, DoubleDiffusion leverages the Laplace-Beltrami operator to process features while respecting the mesh structure. This combination enables effective geometry-aware signal diffusion across the underlying geometry. As shown in Fig. 1, we demonstrate that DoubleDiffusion is able to generate RGB signal distributions on complex 3D mesh surfaces and achieves per-category shape-conditioned texture generation across different shape geometries. Our work contributes a new direction in diffusion-based generative modeling on 3D surfaces, with potential applications in the field of 3D asset generation.
+
+ comment: Codes: https://github.com/Wxyxixixi/DoubleDiffusion_3D_Mesh +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence for Cochlear Implants: Review of Strategies, + Challenges, and Perspectives + + +
+ Automatic speech recognition (ASR) plays a pivotal role in our daily lives, +offering utility not only for interacting with machines but also for +facilitating communication for individuals with partial or profound hearing +impairments. The process involves receiving the speech signal in analog form, +followed by various signal processing algorithms to make it compatible with +devices of limited capacities, such as cochlear implants (CIs). Unfortunately, +these implants, equipped with a finite number of electrodes, often result in +speech distortion during synthesis. Despite efforts by researchers to enhance +received speech quality using various state-of-the-art (SOTA) signal processing +techniques, challenges persist, especially in scenarios involving multiple +sources of speech, environmental noise, and other adverse conditions. The +advent of new artificial intelligence (AI) methods has ushered in cutting-edge +strategies to address the limitations and difficulties associated with +traditional signal processing techniques dedicated to CIs. This review aims to +comprehensively cover advancements in CI-based ASR and speech enhancement, +among other related aspects. The primary objective is to provide a thorough +overview of metrics and datasets, exploring the capabilities of AI algorithms +in this biomedical field, and summarizing and commenting on the best results +obtained. Additionally, the review will delve into potential applications and +suggest future directions to bridge existing research gaps in this domain. + +
+
+
+
+
+ + ♻ ☆ Exploring Superpixel Segmentation Methods in the Context of Citizen + Science and Deforestation Detection + + +
+ Tropical forests play an essential role in the planet's ecosystem, making the +conservation of these biomes a worldwide priority. However, ongoing +deforestation and degradation pose a significant threat to their existence, +necessitating effective monitoring and the proposal of actions to mitigate the +damage caused by these processes. In this regard, initiatives range from +government and private sector monitoring programs to solutions based on citizen +science campaigns, for example. Particularly in the context of citizen science +campaigns, the segmentation of remote sensing images to identify deforested +areas and subsequently submit them to analysis by non-specialized volunteers is +necessary. Thus, segmentation using superpixel-based techniques proves to be a +viable solution for this important task. Therefore, this paper presents an +analysis of 22 superpixel-based segmentation methods applied to remote sensing +images, aiming to identify which of them are more suitable for generating +segments for citizen science campaigns. The results reveal that seven of the +segmentation methods outperformed the baseline method (SLIC) currently employed +in the ForestEyes citizen science project, indicating an opportunity for +improvement in this important stage of campaign development. + +
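For reference, the SLIC baseline mentioned above is available in scikit-image; a minimal call (with illustrative parameters and a bundled sample image instead of remote-sensing tiles) looks like this:

```python
# Quick superpixel example with scikit-image's SLIC; parameters are illustrative,
# and a real deforestation workflow would operate on remote-sensing imagery.
import numpy as np
from skimage import data
from skimage.segmentation import slic, mark_boundaries

image = data.astronaut()                          # stand-in RGB image
segments = slic(image, n_segments=400, compactness=10.0, start_label=1)
print("number of superpixels:", len(np.unique(segments)))
overlay = mark_boundaries(image, segments)        # RGB float image for inspection
```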
+
+ comment: This paper is under review +
+
+
+
+
+ + ♻ ☆ Fresh-CL: Feature Realignment through Experts on Hypersphere in + Continual Learning + + +
+ Continual Learning enables models to learn and adapt to new tasks while +retaining prior knowledge. Introducing new tasks, however, can naturally lead +to feature entanglement across tasks, limiting the model's capability to +distinguish between new domain data. In this work, we propose a method called +Feature Realignment through Experts on hyperSpHere in Continual Learning +(Fresh-CL). By leveraging predefined and fixed simplex equiangular tight frame +(ETF) classifiers on a hypersphere, our model improves feature separation both +intra and inter tasks. However, the projection to a simplex ETF shifts with new +tasks, disrupting structured feature representation of previous tasks and +degrading performance. Therefore, we propose a dynamic extension of ETF through +mixture of experts, enabling adaptive projections onto diverse subspaces to +enhance feature representation. Experiments on 11 datasets demonstrate a 2% +improvement in accuracy compared to the strongest baseline, particularly in +fine-grained datasets, confirming the efficacy of combining ETF and MoE to +improve feature distinction in continual learning scenarios. + +
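A fixed simplex equiangular tight frame classifier of the kind referenced above can be constructed directly; the sketch below uses the standard construction with toy dimensions (10 classes, 128-dimensional features), which are assumptions rather than the paper's settings.

```python
# Standard simplex ETF construction: class prototypes with unit norm and
# pairwise inner products of -1/(K-1).
import numpy as np

def simplex_etf(num_classes, feat_dim, seed=0):
    assert feat_dim >= num_classes
    rng = np.random.default_rng(seed)
    # Random orthonormal basis U in R^{feat_dim x num_classes}.
    U, _ = np.linalg.qr(rng.normal(size=(feat_dim, num_classes)))
    K = num_classes
    M = np.sqrt(K / (K - 1)) * U @ (np.eye(K) - np.ones((K, K)) / K)
    return M                                  # columns are fixed class prototypes

M = simplex_etf(num_classes=10, feat_dim=128)
gram = M.T @ M
print(np.round(gram[:3, :3], 3))              # 1 on the diagonal, -1/(K-1) elsewhere
```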
+
+ comment: Accepted by ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ Mitigating Low-Frequency Bias: Feature Recalibration and Frequency + Attention Regularization for Adversarial Robustness + + +
+ Ensuring the robustness of deep neural networks against adversarial attacks +remains a fundamental challenge in computer vision. While adversarial training +(AT) has emerged as a promising defense strategy, our analysis reveals a +critical limitation: AT-trained models exhibit a bias toward low-frequency +features while neglecting high-frequency components. This bias is particularly +concerning as each frequency component carries distinct and crucial +information: low-frequency features encode fundamental structural patterns, +while high-frequency features capture intricate details and textures. To +address this limitation, we propose High-Frequency Feature Disentanglement and +Recalibration (HFDR), a novel module that strategically separates and +recalibrates frequency-specific features to capture latent semantic cues. We +further introduce frequency attention regularization to harmonize feature +extraction across the frequency spectrum and mitigate the inherent +low-frequency bias of AT. Extensive experiments demonstrate our method's +superior performance against white-box attacks and transfer attacks, while +exhibiting strong generalization capabilities across diverse scenarios. + +
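A simple way to picture the low/high-frequency decomposition discussed above is an FFT-based split with a radial mask; the cutoff radius below is arbitrary and the snippet is only a conceptual aid, not the HFDR module.

```python
# Exact FFT-based split of a 2-D image into low- and high-frequency parts.
import numpy as np

def frequency_split(img, radius=0.1):
    """img: 2-D array. Returns (low_freq, high_freq) with low + high == img."""
    F = np.fft.fftshift(np.fft.fft2(img))
    h, w = img.shape
    yy, xx = np.mgrid[:h, :w]
    dist = np.sqrt((yy - h / 2) ** 2 + (xx - w / 2) ** 2)
    mask = dist <= radius * min(h, w)          # keep only low spatial frequencies
    low = np.real(np.fft.ifft2(np.fft.ifftshift(F * mask)))
    high = img - low
    return low, high

img = np.random.rand(64, 64)
low, high = frequency_split(img)
print(np.allclose(low + high, img))            # True: the split is exact
```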
+
+
+
+
+ + ♻ ☆ SELMA3D challenge: Self-supervised learning for 3D light-sheet + microscopy image segmentation + + +
+ Recent innovations in light sheet microscopy, paired with developments in +tissue clearing techniques, enable the 3D imaging of large mammalian tissues +with cellular resolution. Combined with the progress in large-scale data +analysis, driven by deep learning, these innovations empower researchers to +rapidly investigate the morphological and functional properties of diverse +biological samples. Segmentation, a crucial preliminary step in the analysis +process, can be automated using domain-specific deep learning models with +expert-level performance. However, these models exhibit high sensitivity to +domain shifts, leading to a significant drop in accuracy when applied to data +outside their training distribution. To address this limitation, and inspired +by the recent success of self-supervised learning in training generalizable +models, we organized the SELMA3D Challenge during the MICCAI 2024 conference. +SELMA3D provides a vast collection of light-sheet images from cleared mice and +human brains, comprising 35 large 3D images-each with over 1000^3 voxels-and +315 annotated small patches for finetuning, preliminary testing and final +testing. The dataset encompasses diverse biological structures, including +vessel-like and spot-like structures. Five teams participated in all phases of +the challenge, and their proposed methods are reviewed in this paper. +Quantitative and qualitative results from most participating teams demonstrate +that self-supervised learning on large datasets improves segmentation model +performance and generalization. We will continue to support and extend SELMA3D +as an inaugural MICCAI challenge focused on self-supervised learning for 3D +microscopy image segmentation. + +
+
+ comment: 2nd version
+
+
+
+
+
+
+ + + +
+
+ +
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..69f5da7
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all <details> entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`